650 files changed, 39455 insertions, 19310 deletions
diff --git a/net/802/fc.c b/net/802/fc.c
index 34cf1ee014b..1e49f2d4ea9 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -70,7 +70,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
 	if(daddr)
 	{
 		memcpy(fch->daddr,daddr,dev->addr_len);
-		return(hdr_len);
+		return hdr_len;
 	}
 	return -hdr_len;
 }
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 3ef0ab0a543..94b3ad08f39 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -82,10 +82,10 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr != NULL)
 	{
 		memcpy(fddi->daddr, daddr, dev->addr_len);
-		return(hl);
+		return hl;
 	}
 
-	return(-hl);
+	return -hl;
 }
 
 
@@ -108,7 +108,7 @@ static int fddi_rebuild_header(struct sk_buff	*skb)
 	{
 		printk("%s: Don't know how to resolve type %04X addresses.\n",
 		       skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
-		return(0);
+		return 0;
 	}
 }
 
@@ -162,7 +162,7 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 	/* Assume 802.2 SNAP frames, for now */
 
-	return(type);
+	return type;
 }
 
 EXPORT_SYMBOL(fddi_type_trans);
@@ -170,9 +170,9 @@ EXPORT_SYMBOL(fddi_type_trans);
 int fddi_change_mtu(struct net_device *dev, int new_mtu)
 {
 	if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
-		return(-EINVAL);
+		return -EINVAL;
 	dev->mtu = new_mtu;
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(fddi_change_mtu);
 
diff --git a/net/802/garp.c b/net/802/garp.c
index 941f2a324d3..c1df2dad8c6 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -346,8 +346,8 @@ int garp_request_join(const struct net_device *dev,
 		      const struct garp_application *appl,
 		      const void *data, u8 len, u8 type)
 {
-	struct garp_port *port = dev->garp_port;
-	struct garp_applicant *app = port->applicants[appl->type];
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
 	struct garp_attr *attr;
 
 	spin_lock_bh(&app->lock);
@@ -366,8 +366,8 @@ void garp_request_leave(const struct net_device *dev,
 			const struct garp_application *appl,
 			const void *data, u8 len, u8 type)
 {
-	struct garp_port *port = dev->garp_port;
-	struct garp_applicant *app = port->applicants[appl->type];
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
 	struct garp_attr *attr;
 
 	spin_lock_bh(&app->lock);
@@ -546,11 +546,11 @@ static int garp_init_port(struct net_device *dev)
 
 static void garp_release_port(struct net_device *dev)
 {
-	struct garp_port *port = dev->garp_port;
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
 	unsigned int i;
 
 	for (i = 0; i <= GARP_APPLICATION_MAX; i++) {
-		if (port->applicants[i])
+		if (rtnl_dereference(port->applicants[i]))
 			return;
 	}
 	rcu_assign_pointer(dev->garp_port, NULL);
@@ -565,7 +565,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
 
 	ASSERT_RTNL();
 
-	if (!dev->garp_port) {
+	if (!rtnl_dereference(dev->garp_port)) {
 		err = garp_init_port(dev);
 		if (err < 0)
 			goto err1;
@@ -601,8 +601,8 @@ EXPORT_SYMBOL_GPL(garp_init_applicant);
 
 void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl)
 {
-	struct garp_port *port = dev->garp_port;
-	struct garp_applicant *app = port->applicants[appl->type];
+	struct garp_port *port = rtnl_dereference(dev->garp_port);
+	struct garp_applicant *app = rtnl_dereference(port->applicants[appl->type]);
 
 	ASSERT_RTNL();
 
diff --git a/net/802/hippi.c b/net/802/hippi.c
index cd3e8e92952..91aca8780fd 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -152,7 +152,7 @@ int hippi_change_mtu(struct net_device *dev, int new_mtu)
 	if ((new_mtu < 68) || (new_mtu > 65280))
 		return -EINVAL;
 	dev->mtu = new_mtu;
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(hippi_change_mtu);
 
diff --git a/net/802/stp.c b/net/802/stp.c
index 53c8f77f0cc..978c30b1b36 100644
--- a/net/802/stp.c
+++ b/net/802/stp.c
@@ -21,8 +21,8 @@
 #define GARP_ADDR_MAX	0x2F
 #define GARP_ADDR_RANGE	(GARP_ADDR_MAX - GARP_ADDR_MIN)
 
-static const struct stp_proto *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
-static const struct stp_proto *stp_proto __read_mostly;
+static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
+static const struct stp_proto __rcu *stp_proto __read_mostly;
 
 static struct llc_sap *sap __read_mostly;
 static unsigned int sap_registered;
diff --git a/net/802/tr.c b/net/802/tr.c
index 1c6e596074d..5e20cf8a074 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -145,7 +145,7 @@ static int tr_header(struct sk_buff *skb, struct net_device *dev,
 	{
 		memcpy(trh->daddr,daddr,dev->addr_len);
 		tr_source_route(skb, trh, dev);
-		return(hdr_len);
+		return hdr_len;
 	}
 
 	return -hdr_len;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 3c1c8c14e92..52077ca2207 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -44,9 +44,6 @@
 
 int vlan_net_id __read_mostly;
 
-/* Our listing of VLAN group(s) */
-static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE];
-
 const char vlan_fullname[] = "802.1Q VLAN Support";
 const char vlan_version[] = DRV_VERSION;
 static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
@@ -59,40 +56,6 @@ static struct packet_type vlan_packet_type __read_mostly = {
 
 /* End of global variables definitions. */
 
-static inline unsigned int vlan_grp_hashfn(unsigned int idx)
-{
-	return ((idx >> VLAN_GRP_HASH_SHIFT) ^ idx) & VLAN_GRP_HASH_MASK;
-}
-
-/* Must be invoked with RCU read lock (no preempt) */
-static struct vlan_group *__vlan_find_group(struct net_device *real_dev)
-{
-	struct vlan_group *grp;
-	struct hlist_node *n;
-	int hash = vlan_grp_hashfn(real_dev->ifindex);
-
-	hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) {
-		if (grp->real_dev == real_dev)
-			return grp;
-	}
-
-	return NULL;
-}
-
-/*  Find the protocol handler.  Assumes VID < VLAN_VID_MASK.
- *
- * Must be invoked with RCU read lock (no preempt)
- */
-struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id)
-{
-	struct vlan_group *grp = __vlan_find_group(real_dev);
-
-	if (grp)
-		return vlan_group_get_device(grp, vlan_id);
-
-	return NULL;
-}
-
 static void vlan_group_free(struct vlan_group *grp)
 {
 	int i;
@@ -111,8 +74,6 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev)
 		return NULL;
 
 	grp->real_dev = real_dev;
-	hlist_add_head_rcu(&grp->hlist,
-			&vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]);
 	return grp;
 }
 
@@ -151,13 +112,14 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 
 	ASSERT_RTNL();
 
-	grp = __vlan_find_group(real_dev);
+	grp = rtnl_dereference(real_dev->vlgrp);
 	BUG_ON(!grp);
 
 	/* Take it out of our own structures, but be sure to interlock with
-	 * HW accelerating devices or SW vlan input packet processing.
+	 * HW accelerating devices or SW vlan input packet processing if
+	 * VLAN is not 0 (leave it there for 802.1p).
 	 */
-	if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
+	if (vlan_id && (real_dev->features & NETIF_F_HW_VLAN_FILTER))
 		ops->ndo_vlan_rx_kill_vid(real_dev, vlan_id);
 
 	grp->nr_vlans--;
@@ -172,11 +134,10 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 	if (grp->nr_vlans == 0) {
 		vlan_gvrp_uninit_applicant(real_dev);
 
-		if (real_dev->features & NETIF_F_HW_VLAN_RX)
+		rcu_assign_pointer(real_dev->vlgrp, NULL);
+		if (ops->ndo_vlan_rx_register)
 			ops->ndo_vlan_rx_register(real_dev, NULL);
 
-		hlist_del_rcu(&grp->hlist);
-
 		/* Free the group, after all cpu's are done. */
 		call_rcu(&grp->rcu, vlan_rcu_free);
 	}
@@ -195,18 +156,13 @@ int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id)
 		return -EOPNOTSUPP;
 	}
 
-	if ((real_dev->features & NETIF_F_HW_VLAN_RX) && !ops->ndo_vlan_rx_register) {
-		pr_info("8021q: device %s has buggy VLAN hw accel\n", name);
-		return -EOPNOTSUPP;
-	}
-
 	if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) &&
 	    (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) {
 		pr_info("8021q: Device %s has buggy VLAN hw accel\n", name);
 		return -EOPNOTSUPP;
 	}
 
-	if (__find_vlan_dev(real_dev, vlan_id) != NULL)
+	if (vlan_find_dev(real_dev, vlan_id) != NULL)
 		return -EEXIST;
 
 	return 0;
@@ -221,7 +177,7 @@ int register_vlan_dev(struct net_device *dev)
 	struct vlan_group *grp, *ngrp = NULL;
 	int err;
 
-	grp = __vlan_find_group(real_dev);
+	grp = rtnl_dereference(real_dev->vlgrp);
 	if (!grp) {
 		ngrp = grp = vlan_group_alloc(real_dev);
 		if (!grp)
@@ -251,8 +207,11 @@ int register_vlan_dev(struct net_device *dev)
 	vlan_group_set_device(grp, vlan_id, dev);
 	grp->nr_vlans++;
 
-	if (ngrp && real_dev->features & NETIF_F_HW_VLAN_RX)
-		ops->ndo_vlan_rx_register(real_dev, ngrp);
+	if (ngrp) {
+		if (ops->ndo_vlan_rx_register)
+			ops->ndo_vlan_rx_register(real_dev, ngrp);
+		rcu_assign_pointer(real_dev->vlgrp, ngrp);
+	}
 	if (real_dev->features & NETIF_F_HW_VLAN_FILTER)
 		ops->ndo_vlan_rx_add_vid(real_dev, vlan_id);
 
@@ -263,7 +222,6 @@ out_uninit_applicant:
 		vlan_gvrp_uninit_applicant(real_dev);
 out_free_group:
 	if (ngrp) {
-		hlist_del_rcu(&ngrp->hlist);
 		/* Free the group, after all cpu's are done. */
 		call_rcu(&ngrp->rcu, vlan_rcu_free);
 	}
@@ -320,7 +278,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	if (new_dev == NULL)
 		return -ENOBUFS;
 
-	new_dev->real_num_tx_queues = real_dev->real_num_tx_queues;
+	netif_copy_real_num_queues(new_dev, real_dev);
 	dev_net_set(new_dev, net);
 	/* need 4 bytes for extra VLAN header info,
 	 * hope the underlying device can handle it.
@@ -419,7 +377,15 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	if (is_vlan_dev(dev))
 		__vlan_device_event(dev, event);
 
-	grp = __vlan_find_group(dev);
+	if ((event == NETDEV_UP) &&
+	    (dev->features & NETIF_F_HW_VLAN_FILTER) &&
+	    dev->netdev_ops->ndo_vlan_rx_add_vid) {
+		pr_info("8021q: adding VLAN 0 to HW filter on device %s\n",
+			dev->name);
+		dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0);
+	}
+
+	grp = rtnl_dereference(dev->vlgrp);
 	if (!grp)
 		goto out;
 
@@ -430,7 +396,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	switch (event) {
 	case NETDEV_CHANGE:
 		/* Propagate real device state to vlan devices */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -441,7 +407,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 
 	case NETDEV_CHANGEADDR:
 		/* Adjust unicast filters on underlying device */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -455,7 +421,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		break;
 
 	case NETDEV_CHANGEMTU:
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -469,7 +435,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 
 	case NETDEV_FEAT_CHANGE:
 		/* Propagate device features to underlying device */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -481,7 +447,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 
 	case NETDEV_DOWN:
 		/* Put all VLANs for this dev in the down state too.  */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -499,7 +465,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 
 	case NETDEV_UP:
 		/* Put all VLANs for this dev in the up state too.  */
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -516,10 +482,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		break;
 
 	case NETDEV_UNREGISTER:
+		/* twiddle thumbs on netns device moves */
+		if (dev->reg_state != NETREG_UNREGISTERING)
+			break;
+
 		/* Delete all VLANs for this dev. */
 		grp->killall = 1;
 
-		for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+		for (i = 0; i < VLAN_N_VID; i++) {
 			vlandev = vlan_group_get_device(grp, i);
 			if (!vlandev)
 				continue;
@@ -527,7 +497,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 			/* unregistration of last vlan destroys group, abort
 			 * afterwards */
 			if (grp->nr_vlans == 1)
-				i = VLAN_GROUP_ARRAY_LEN;
+				i = VLAN_N_VID;
 
 			unregister_vlan_dev(vlandev, &list);
 		}
@@ -733,8 +703,6 @@ err0:
 
 static void __exit vlan_cleanup_module(void)
 {
-	unsigned int i;
-
 	vlan_ioctl_set(NULL);
 	vlan_netlink_fini();
 
@@ -742,10 +710,6 @@ static void __exit vlan_cleanup_module(void)
 
 	dev_remove_pack(&vlan_packet_type);
 
-	/* This table must be empty if there are no module references left. */
-	for (i = 0; i < VLAN_GRP_HASH_SIZE; i++)
-		BUG_ON(!hlist_empty(&vlan_group_hash[i]));
-
 	unregister_pernet_subsys(&vlan_net_ops);
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 6abdcac1b2e..db01b3181fd 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -2,6 +2,7 @@
 #define __BEN_VLAN_802_1Q_INC__
 
 #include <linux/if_vlan.h>
+#include <linux/u64_stats_sync.h>
 
 
 /**
@@ -21,14 +22,16 @@ struct vlan_priority_tci_mapping {
  *	struct vlan_rx_stats - VLAN percpu rx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
- *	@multicast: number of received multicast packets
+ *	@rx_multicast: number of received multicast packets
+ *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
  */
 struct vlan_rx_stats {
-	unsigned long rx_packets;
-	unsigned long rx_bytes;
-	unsigned long multicast;
-	unsigned long rx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	struct u64_stats_sync	syncp;
+	unsigned long		rx_errors;
 };
 
 /**
@@ -69,23 +72,6 @@ static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev)
 	return netdev_priv(dev);
 }
 
-#define VLAN_GRP_HASH_SHIFT	5
-#define VLAN_GRP_HASH_SIZE	(1 << VLAN_GRP_HASH_SHIFT)
-#define VLAN_GRP_HASH_MASK	(VLAN_GRP_HASH_SIZE - 1)
-
-/*  Find a VLAN device by the MAC address of its Ethernet device, and
- *  it's VLAN ID.  The default configuration is to have VLAN's scope
- *  to be box-wide, so the MAC will be ignored.  The mac will only be
- *  looked at if we are configured to have a separate set of VLANs per
- *  each MAC addressable interface.  Note that this latter option does
- *  NOT follow the spec for VLANs, but may be useful for doing very
- *  large quantities of VLAN MUX/DEMUX onto FrameRelay or ATM PVCs.
- *
- *  Must be invoked with rcu_read_lock (ie preempt disabled)
- *  or with RTNL.
- */
-struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id);
-
 /* found in vlan_dev.c */
 int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		  struct packet_type *ptype, struct net_device *orig_dev);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 50f58f5f1c3..69b2f79800a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -4,46 +4,31 @@
 #include <linux/netpoll.h>
 #include "vlan.h"
 
-/* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
-int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
-		      u16 vlan_tci, int polling)
+bool vlan_hwaccel_do_receive(struct sk_buff **skbp)
 {
-	if (netpoll_rx(skb))
-		return NET_RX_DROP;
-
-	if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
-		skb->deliver_no_wcard = 1;
-
-	skb->skb_iif = skb->dev->ifindex;
-	__vlan_hwaccel_put_tag(skb, vlan_tci);
-	skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
-
-	if (!skb->dev)
-		goto drop;
-
-	return (polling ? netif_receive_skb(skb) : netif_rx(skb));
-
-drop:
-	dev_kfree_skb_any(skb);
-	return NET_RX_DROP;
-}
-EXPORT_SYMBOL(__vlan_hwaccel_rx);
-
-int vlan_hwaccel_do_receive(struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	struct vlan_rx_stats     *rx_stats;
+	struct sk_buff *skb = *skbp;
+	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
+	struct net_device *vlan_dev;
+	struct vlan_rx_stats *rx_stats;
+
+	vlan_dev = vlan_find_dev(skb->dev, vlan_id);
+	if (!vlan_dev) {
+		if (vlan_id)
+			skb->pkt_type = PACKET_OTHERHOST;
+		return false;
+	}
 
-	skb->dev = vlan_dev_info(dev)->real_dev;
-	netif_nit_deliver(skb);
+	skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return false;
 
-	skb->dev = dev;
-	skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci);
+	skb->dev = vlan_dev;
+	skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
 	skb->vlan_tci = 0;
 
-	rx_stats = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats,
-			       smp_processor_id());
+	rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_rx_stats);
 
+	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
 	rx_stats->rx_bytes += skb->len;
 
@@ -51,18 +36,20 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 	case PACKET_BROADCAST:
 		break;
 	case PACKET_MULTICAST:
-		rx_stats->multicast++;
+		rx_stats->rx_multicast++;
 		break;
 	case PACKET_OTHERHOST:
 		/* Our lower layer thinks this is not local, let's make sure.
 		 * This allows the VLAN to have a different MAC than the
 		 * underlying device, and still route correctly. */
 		if (!compare_ether_addr(eth_hdr(skb)->h_dest,
-					dev->dev_addr))
+					vlan_dev->dev_addr))
 			skb->pkt_type = PACKET_HOST;
 		break;
 	}
-	return 0;
+	u64_stats_update_end(&rx_stats->syncp);
+
+	return true;
 }
 
 struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -77,63 +64,27 @@ u16 vlan_dev_vlan_id(const struct net_device *dev)
 }
 EXPORT_SYMBOL(vlan_dev_vlan_id);
 
-static gro_result_t
-vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
-		unsigned int vlan_tci, struct sk_buff *skb)
+/* VLAN rx hw acceleration helper.  This acts like netif_{rx,receive_skb}(). */
+int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
+		      u16 vlan_tci, int polling)
 {
-	struct sk_buff *p;
-
-	if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
-		skb->deliver_no_wcard = 1;
-
-	skb->skb_iif = skb->dev->ifindex;
 	__vlan_hwaccel_put_tag(skb, vlan_tci);
-	skb->dev = vlan_group_get_device(grp, vlan_tci & VLAN_VID_MASK);
-
-	if (!skb->dev)
-		goto drop;
-
-	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow =
-			p->dev == skb->dev && !compare_ether_header(
-				skb_mac_header(p), skb_gro_mac_header(skb));
-		NAPI_GRO_CB(p)->flush = 0;
-	}
-
-	return dev_gro_receive(napi, skb);
-
-drop:
-	return GRO_DROP;
+	return polling ? netif_receive_skb(skb) : netif_rx(skb);
 }
+EXPORT_SYMBOL(__vlan_hwaccel_rx);
 
 gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
 			      unsigned int vlan_tci, struct sk_buff *skb)
 {
-	if (netpoll_rx_on(skb))
-		return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
-			? GRO_DROP : GRO_NORMAL;
-
-	skb_gro_reset_offset(skb);
-
-	return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
+	__vlan_hwaccel_put_tag(skb, vlan_tci);
+	return napi_gro_receive(napi, skb);
 }
 EXPORT_SYMBOL(vlan_gro_receive);
 
 gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
 			    unsigned int vlan_tci)
 {
-	struct sk_buff *skb = napi_frags_skb(napi);
-
-	if (!skb)
-		return GRO_DROP;
-
-	if (netpoll_rx_on(skb)) {
-		skb->protocol = eth_type_trans(skb, skb->dev);
-		return vlan_hwaccel_receive_skb(skb, grp, vlan_tci)
-			? GRO_DROP : GRO_NORMAL;
-	}
-
-	return napi_frags_finish(napi, skb,
-				 vlan_gro_common(napi, grp, vlan_tci, skb));
+	__vlan_hwaccel_put_tag(napi->skb, vlan_tci);
+	return napi_gro_frags(napi);
 }
 EXPORT_SYMBOL(vlan_gro_frags);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 52984267781..14e3d1fa07a 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -142,6 +142,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 {
 	struct vlan_hdr *vhdr;
 	struct vlan_rx_stats *rx_stats;
+	struct net_device *vlan_dev;
 	u16 vlan_id;
 	u16 vlan_tci;
 
@@ -157,62 +158,82 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	vlan_id = vlan_tci & VLAN_VID_MASK;
 
 	rcu_read_lock();
-	skb->dev = __find_vlan_dev(dev, vlan_id);
-	if (!skb->dev) {
-		pr_debug("%s: ERROR: No net_device for VID: %u on dev: %s\n",
-			 __func__, vlan_id, dev->name);
-		goto err_unlock;
-	}
-
-	rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
-			       smp_processor_id());
-	rx_stats->rx_packets++;
-	rx_stats->rx_bytes += skb->len;
-
-	skb_pull_rcsum(skb, VLAN_HLEN);
-
-	skb->priority = vlan_get_ingress_priority(skb->dev, vlan_tci);
-
-	pr_debug("%s: priority: %u for TCI: %hu\n",
-		 __func__, skb->priority, vlan_tci);
-
-	switch (skb->pkt_type) {
-	case PACKET_BROADCAST: /* Yeah, stats collect these together.. */
-		/* stats->broadcast ++; // no such counter :-( */
-		break;
+	vlan_dev = vlan_find_dev(dev, vlan_id);
 
-	case PACKET_MULTICAST:
-		rx_stats->multicast++;
-		break;
+	/* If the VLAN device is defined, we use it.
+	 * If not, and the VID is 0, it is a 802.1p packet (not
+	 * really a VLAN), so we will just netif_rx it later to the
+	 * original interface, but with the skb->proto set to the
+	 * wrapped proto: we do nothing here.
+	 */
 
-	case PACKET_OTHERHOST:
-		/* Our lower layer thinks this is not local, let's make sure.
-		 * This allows the VLAN to have a different MAC than the
-		 * underlying device, and still route correctly.
-		 */
-		if (!compare_ether_addr(eth_hdr(skb)->h_dest,
-					skb->dev->dev_addr))
-			skb->pkt_type = PACKET_HOST;
-		break;
-	default:
-		break;
+	if (!vlan_dev) {
+		if (vlan_id) {
+			pr_debug("%s: ERROR: No net_device for VID: %u on dev: %s\n",
+				 __func__, vlan_id, dev->name);
+			goto err_unlock;
+		}
+		rx_stats = NULL;
+	} else {
+		skb->dev = vlan_dev;
+
+		rx_stats = this_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats);
+
+		u64_stats_update_begin(&rx_stats->syncp);
+		rx_stats->rx_packets++;
+		rx_stats->rx_bytes += skb->len;
+
+		skb->priority = vlan_get_ingress_priority(skb->dev, vlan_tci);
+
+		pr_debug("%s: priority: %u for TCI: %hu\n",
+			 __func__, skb->priority, vlan_tci);
+
+		switch (skb->pkt_type) {
+		case PACKET_BROADCAST:
+			/* Yeah, stats collect these together.. */
+			/* stats->broadcast ++; // no such counter :-( */
+			break;
+
+		case PACKET_MULTICAST:
+			rx_stats->rx_multicast++;
+			break;
+
+		case PACKET_OTHERHOST:
+			/* Our lower layer thinks this is not local, let's make
+			 * sure.
+			 * This allows the VLAN to have a different MAC than the
+			 * underlying device, and still route correctly.
+			 */
+			if (!compare_ether_addr(eth_hdr(skb)->h_dest,
+						skb->dev->dev_addr))
+				skb->pkt_type = PACKET_HOST;
+			break;
+		default:
+			break;
+		}
+		u64_stats_update_end(&rx_stats->syncp);
 	}
 
+	skb_pull_rcsum(skb, VLAN_HLEN);
 	vlan_set_encap_proto(skb, vhdr);
 
-	skb = vlan_check_reorder_header(skb);
-	if (!skb) {
-		rx_stats->rx_errors++;
-		goto err_unlock;
+	if (vlan_dev) {
+		skb = vlan_check_reorder_header(skb);
+		if (!skb) {
+			rx_stats->rx_errors++;
+			goto err_unlock;
+		}
 	}
 
 	netif_rx(skb);
+
 	rcu_read_unlock();
 	return NET_RX_SUCCESS;
 
 err_unlock:
 	rcu_read_unlock();
 err_free:
+	atomic_long_inc(&dev->rx_dropped);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -491,7 +512,8 @@ static int vlan_dev_open(struct net_device *dev)
 	if (vlan->flags & VLAN_FLAG_GVRP)
 		vlan_gvrp_request_join(dev);
 
-	netif_carrier_on(dev);
+	if (netif_carrier_ok(real_dev))
+		netif_carrier_on(dev);
 	return 0;
 
 clear_allmulti:
@@ -801,37 +823,65 @@ static u32 vlan_ethtool_get_flags(struct net_device *dev)
 	return dev_ethtool_get_flags(vlan->real_dev);
 }
 
-static struct net_device_stats *vlan_dev_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
-	struct net_device_stats *stats = &dev->stats;
-
 	dev_txq_stats_fold(dev, stats);
 
 	if (vlan_dev_info(dev)->vlan_rx_stats) {
-		struct vlan_rx_stats *p, rx = {0};
+		struct vlan_rx_stats *p, accum = {0};
 		int i;
 
 		for_each_possible_cpu(i) {
+			u64 rxpackets, rxbytes, rxmulticast;
+			unsigned int start;
+
 			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats, i);
-			rx.rx_packets += p->rx_packets;
-			rx.rx_bytes   += p->rx_bytes;
-			rx.rx_errors  += p->rx_errors;
-			rx.multicast  += p->multicast;
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rxpackets	= p->rx_packets;
+				rxbytes		= p->rx_bytes;
+				rxmulticast	= p->rx_multicast;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+			accum.rx_packets += rxpackets;
+			accum.rx_bytes   += rxbytes;
+			accum.rx_multicast += rxmulticast;
+			/* rx_errors is ulong, not protected by syncp */
+			accum.rx_errors  += p->rx_errors;
 		}
-		stats->rx_packets = rx.rx_packets;
-		stats->rx_bytes   = rx.rx_bytes;
-		stats->rx_errors  = rx.rx_errors;
-		stats->multicast  = rx.multicast;
+		stats->rx_packets = accum.rx_packets;
+		stats->rx_bytes   = accum.rx_bytes;
+		stats->rx_errors  = accum.rx_errors;
+		stats->multicast  = accum.rx_multicast;
 	}
 	return stats;
 }
 
+static int vlan_ethtool_set_tso(struct net_device *dev, u32 data)
+{
+       if (data) {
+		struct net_device *real_dev = vlan_dev_info(dev)->real_dev;
+
+		/* Underlying device must support TSO for VLAN-tagged packets
+		 * and must have TSO enabled now.
+		 */
+		if (!(real_dev->vlan_features & NETIF_F_TSO))
+			return -EOPNOTSUPP;
+		if (!(real_dev->features & NETIF_F_TSO))
+			return -EINVAL;
+		dev->features |= NETIF_F_TSO;
+	} else {
+		dev->features &= ~NETIF_F_TSO;
+	}
+	return 0;
+}
+
 static const struct ethtool_ops vlan_ethtool_ops = {
 	.get_settings	        = vlan_ethtool_get_settings,
 	.get_drvinfo	        = vlan_ethtool_get_drvinfo,
 	.get_link		= ethtool_op_get_link,
 	.get_rx_csum		= vlan_ethtool_get_rx_csum,
 	.get_flags		= vlan_ethtool_get_flags,
+	.set_tso                = vlan_ethtool_set_tso,
 };
 
 static const struct net_device_ops vlan_netdev_ops = {
@@ -848,7 +898,7 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -872,7 +922,7 @@ static const struct net_device_ops vlan_netdev_accel_ops = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -897,7 +947,7 @@ static const struct net_device_ops vlan_netdev_ops_sq = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -922,7 +972,7 @@ static const struct net_device_ops vlan_netdev_accel_ops_sq = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index afead353e21..80e280f5668 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -278,25 +278,27 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
 {
 	struct net_device *vlandev = (struct net_device *) seq->private;
 	const struct vlan_dev_info *dev_info = vlan_dev_info(vlandev);
-	const struct net_device_stats *stats;
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *stats;
 	static const char fmt[] = "%30s %12lu\n";
+	static const char fmt64[] = "%30s %12llu\n";
 	int i;
 
 	if (!is_vlan_dev(vlandev))
 		return 0;
 
-	stats = dev_get_stats(vlandev);
+	stats = dev_get_stats(vlandev, &temp);
 	seq_printf(seq,
 		   "%s  VID: %d	 REORDER_HDR: %i  dev->priv_flags: %hx\n",
 		   vlandev->name, dev_info->vlan_id,
 		   (int)(dev_info->flags & 1), vlandev->priv_flags);
 
-	seq_printf(seq, fmt, "total frames received", stats->rx_packets);
-	seq_printf(seq, fmt, "total bytes received", stats->rx_bytes);
-	seq_printf(seq, fmt, "Broadcast/Multicast Rcvd", stats->multicast);
+	seq_printf(seq, fmt64, "total frames received", stats->rx_packets);
+	seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes);
+	seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast);
 	seq_puts(seq, "\n");
-	seq_printf(seq, fmt, "total frames transmitted", stats->tx_packets);
-	seq_printf(seq, fmt, "total bytes transmitted", stats->tx_bytes);
+	seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets);
+	seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes);
 	seq_printf(seq, fmt, "total headroom inc",
 		   dev_info->cnt_inc_headroom_on_tx);
 	seq_printf(seq, fmt, "total encap on xmit",
diff --git a/net/9p/client.c b/net/9p/client.c
index 37c8da07a80..a848bca9fbf 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -61,13 +61,13 @@ static const match_table_t tokens = {
 
 inline int p9_is_proto_dotl(struct p9_client *clnt)
 {
-	return (clnt->proto_version == p9_proto_2000L);
+	return clnt->proto_version == p9_proto_2000L;
 }
 EXPORT_SYMBOL(p9_is_proto_dotl);
 
 inline int p9_is_proto_dotu(struct p9_client *clnt)
 {
-	return (clnt->proto_version == p9_proto_2000u);
+	return clnt->proto_version == p9_proto_2000u;
 }
 EXPORT_SYMBOL(p9_is_proto_dotu);
 
@@ -331,8 +331,10 @@ static void p9_tag_cleanup(struct p9_client *c)
 		}
 	}
 
-	if (c->tagpool)
+	if (c->tagpool) {
+		p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
 		p9_idpool_destroy(c->tagpool);
+	}
 
 	/* free requests associated with tags */
 	for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
@@ -448,31 +450,43 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
 		return err;
 	}
 
-	if (type == P9_RERROR) {
+	if (type == P9_RERROR || type == P9_RLERROR) {
 		int ecode;
-		char *ename;
 
-		err = p9pdu_readf(req->rc, c->proto_version, "s?d",
-							&ename, &ecode);
-		if (err) {
-			P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n",
-									err);
-			return err;
-		}
+		if (!p9_is_proto_dotl(c)) {
+			char *ename;
 
-		if (p9_is_proto_dotu(c))
-			err = -ecode;
+			err = p9pdu_readf(req->rc, c->proto_version, "s?d",
+								&ename, &ecode);
+			if (err)
+				goto out_err;
+
+			if (p9_is_proto_dotu(c))
+				err = -ecode;
 
-		if (!err || !IS_ERR_VALUE(err))
-			err = p9_errstr2errno(ename, strlen(ename));
+			if (!err || !IS_ERR_VALUE(err)) {
+				err = p9_errstr2errno(ename, strlen(ename));
 
-		P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename);
+				P9_DPRINTK(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", -ecode, ename);
+
+				kfree(ename);
+			}
+		} else {
+			err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
+			err = -ecode;
+
+			P9_DPRINTK(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
+		}
 
-		kfree(ename);
 	} else
 		err = 0;
 
 	return err;
+
+out_err:
+	P9_DPRINTK(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+
+	return err;
 }
 
 /**
@@ -565,11 +579,14 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
 	va_start(ap, fmt);
 	err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
 	va_end(ap);
+	if (err)
+		goto reterr;
 	p9pdu_finalize(req->tc);
 
 	err = c->trans_mod->request(c, req);
 	if (err < 0) {
-		c->status = Disconnected;
+		if (err != -ERESTARTSYS)
+			c->status = Disconnected;
 		goto reterr;
 	}
 
@@ -668,7 +685,7 @@ static void p9_fid_destroy(struct p9_fid *fid)
 	kfree(fid);
 }
 
-int p9_client_version(struct p9_client *c)
+static int p9_client_version(struct p9_client *c)
 {
 	int err = 0;
 	struct p9_req_t *req;
@@ -727,7 +744,6 @@ error:
 
 	return err;
 }
-EXPORT_SYMBOL(p9_client_version);
 
 struct p9_client *p9_client_create(const char *dev_name, char *options)
 {
@@ -884,54 +900,6 @@ error:
 }
 EXPORT_SYMBOL(p9_client_attach);
 
-struct p9_fid *
-p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname)
-{
-	int err;
-	struct p9_req_t *req;
-	struct p9_qid qid;
-	struct p9_fid *afid;
-
-	P9_DPRINTK(P9_DEBUG_9P, ">>> TAUTH uname %s aname %s\n", uname, aname);
-	err = 0;
-
-	afid = p9_fid_create(clnt);
-	if (IS_ERR(afid)) {
-		err = PTR_ERR(afid);
-		afid = NULL;
-		goto error;
-	}
-
-	req = p9_client_rpc(clnt, P9_TAUTH, "dss?d",
-			afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
-	if (IS_ERR(req)) {
-		err = PTR_ERR(req);
-		goto error;
-	}
-
-	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
-	if (err) {
-		p9pdu_dump(1, req->rc);
-		p9_free_req(clnt, req);
-		goto error;
-	}
-
-	P9_DPRINTK(P9_DEBUG_9P, "<<< RAUTH qid %x.%llx.%x\n",
-					qid.type,
-					(unsigned long long)qid.path,
-					qid.version);
-
-	memmove(&afid->qid, &qid, sizeof(struct p9_qid));
-	p9_free_req(clnt, req);
-	return afid;
-
-error:
-	if (afid)
-		p9_fid_destroy(afid);
-	return ERR_PTR(err);
-}
-EXPORT_SYMBOL(p9_client_auth);
-
 struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
 	int clone)
 {
@@ -943,6 +911,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
 	int16_t nwqids, count;
 
 	err = 0;
+	wqids = NULL;
 	clnt = oldfid->clnt;
 	if (clone) {
 		fid = p9_fid_create(clnt);
@@ -993,9 +962,11 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames,
 	else
 		fid->qid = oldfid->qid;
 
+	kfree(wqids);
 	return fid;
 
 clunk_fid:
+	kfree(wqids);
 	p9_client_clunk(fid);
 	fid = NULL;
 
@@ -1015,14 +986,18 @@ int p9_client_open(struct p9_fid *fid, int mode)
 	struct p9_qid qid;
 	int iounit;
 
-	P9_DPRINTK(P9_DEBUG_9P, ">>> TOPEN fid %d mode %d\n", fid->fid, mode);
-	err = 0;
 	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> %s fid %d mode %d\n",
+		p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
+	err = 0;
 
 	if (fid->mode != -1)
 		return -EINVAL;
 
-	req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode);
+	if (p9_is_proto_dotl(clnt))
+		req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode);
+	else
+		req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto error;
@@ -1034,10 +1009,9 @@ int p9_client_open(struct p9_fid *fid, int mode)
 		goto free_and_error;
 	}
 
-	P9_DPRINTK(P9_DEBUG_9P, "<<< ROPEN qid %x.%llx.%x iounit %x\n",
-				qid.type,
-				(unsigned long long)qid.path,
-				qid.version, iounit);
+	P9_DPRINTK(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n",
+		p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN",  qid.type,
+		(unsigned long long)qid.path, qid.version, iounit);
 
 	fid->mode = mode;
 	fid->iounit = iounit;
@@ -1049,6 +1023,50 @@ error:
 }
 EXPORT_SYMBOL(p9_client_open);
 
+int p9_client_create_dotl(struct p9_fid *ofid, char *name, u32 flags, u32 mode,
+		gid_t gid, struct p9_qid *qid)
+{
+	int err = 0;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	int iounit;
+
+	P9_DPRINTK(P9_DEBUG_9P,
+			">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
+			ofid->fid, name, flags, mode, gid);
+	clnt = ofid->clnt;
+
+	if (ofid->mode != -1)
+		return -EINVAL;
+
+	req = p9_client_rpc(clnt, P9_TLCREATE, "dsddd", ofid->fid, name, flags,
+			mode, gid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto free_and_error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n",
+			qid->type,
+			(unsigned long long)qid->path,
+			qid->version, iounit);
+
+	ofid->mode = mode;
+	ofid->iounit = iounit;
+
+free_and_error:
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_create_dotl);
+
 int p9_client_fcreate(struct p9_fid *fid, char *name, u32 perm, int mode,
 		     char *extension)
 {
@@ -1094,12 +1112,97 @@ error:
 }
 EXPORT_SYMBOL(p9_client_fcreate);
 
+int p9_client_symlink(struct p9_fid *dfid, char *name, char *symtgt, gid_t gid,
+		struct p9_qid *qid)
+{
+	int err = 0;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s  symtgt %s\n",
+			dfid->fid, name, symtgt);
+	clnt = dfid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TSYMLINK, "dssd", dfid->fid, name, symtgt,
+			gid);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto free_and_error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n",
+			qid->type, (unsigned long long)qid->path, qid->version);
+
+free_and_error:
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_symlink);
+
+int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, char *newname)
+{
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n",
+			dfid->fid, oldfid->fid, newname);
+	clnt = dfid->clnt;
+	req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid,
+			newname);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RLINK\n");
+	p9_free_req(clnt, req);
+	return 0;
+}
+EXPORT_SYMBOL(p9_client_link);
+
+int p9_client_fsync(struct p9_fid *fid, int datasync)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
+			fid->fid, datasync);
+	err = 0;
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
+
+	p9_free_req(clnt, req);
+
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_fsync);
+
 int p9_client_clunk(struct p9_fid *fid)
 {
 	int err;
 	struct p9_client *clnt;
 	struct p9_req_t *req;
 
+	if (!fid) {
+		P9_EPRINTK(KERN_WARNING, "Trying to clunk with NULL fid\n");
+		dump_stack();
+		return 0;
+	}
+
 	P9_DPRINTK(P9_DEBUG_9P, ">>> TCLUNK fid %d\n", fid->fid);
 	err = 0;
 	clnt = fid->clnt;
@@ -1139,9 +1242,8 @@ int p9_client_remove(struct p9_fid *fid)
 	P9_DPRINTK(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid);
 
 	p9_free_req(clnt, req);
-	p9_fid_destroy(fid);
-
 error:
+	p9_fid_destroy(fid);
 	return err;
 }
 EXPORT_SYMBOL(p9_client_remove);
@@ -1184,16 +1286,13 @@ p9_client_read(struct p9_fid *fid, char *data, char __user *udata, u64 offset,
 
 	if (data) {
 		memmove(data, dataptr, count);
-	}
-
-	if (udata) {
+	} else {
 		err = copy_to_user(udata, dataptr, count);
 		if (err) {
 			err = -EFAULT;
 			goto free_and_error;
 		}
 	}
-
 	p9_free_req(clnt, req);
 	return count;
 
@@ -1302,6 +1401,65 @@ error:
 }
 EXPORT_SYMBOL(p9_client_stat);
 
+struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
+							u64 request_mask)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl),
+								GFP_KERNEL);
+	struct p9_req_t *req;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
+							fid->fid, request_mask);
+
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	err = 0;
+	clnt = fid->clnt;
+
+	req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		p9_free_req(clnt, req);
+		goto error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P,
+		"<<< RGETATTR st_result_mask=%lld\n"
+		"<<< qid=%x.%llx.%x\n"
+		"<<< st_mode=%8.8x st_nlink=%llu\n"
+		"<<< st_uid=%d st_gid=%d\n"
+		"<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
+		"<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
+		"<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
+		"<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
+		"<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
+		"<<< st_gen=%lld st_data_version=%lld",
+		ret->st_result_mask, ret->qid.type, ret->qid.path,
+		ret->qid.version, ret->st_mode, ret->st_nlink, ret->st_uid,
+		ret->st_gid, ret->st_rdev, ret->st_size, ret->st_blksize,
+		ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec,
+		ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec,
+		ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec,
+		ret->st_gen, ret->st_data_version);
+
+	p9_free_req(clnt, req);
+	return ret;
+
+error:
+	kfree(ret);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(p9_client_getattr_dotl);
+
 static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
 {
 	int ret;
@@ -1366,6 +1524,36 @@ error:
 }
 EXPORT_SYMBOL(p9_client_wstat);
 
+int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
+{
+	int err;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid);
+	P9_DPRINTK(P9_DEBUG_9P,
+		"    valid=%x mode=%x uid=%d gid=%d size=%lld\n"
+		"    atime_sec=%lld atime_nsec=%lld\n"
+		"    mtime_sec=%lld mtime_nsec=%lld\n",
+		p9attr->valid, p9attr->mode, p9attr->uid, p9attr->gid,
+		p9attr->size, p9attr->atime_sec, p9attr->atime_nsec,
+		p9attr->mtime_sec, p9attr->mtime_nsec);
+
+	req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr);
+
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid);
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_setattr);
+
 int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
 {
 	int err;
@@ -1432,3 +1620,280 @@ error:
 }
 EXPORT_SYMBOL(p9_client_rename);
 
+/*
+ * An xattrwalk without @attr_name gives the fid for the lisxattr namespace
+ */
+struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
+				const char *attr_name, u64 *attr_size)
+{
+	int err;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+	struct p9_fid *attr_fid;
+
+	err = 0;
+	clnt = file_fid->clnt;
+	attr_fid = p9_fid_create(clnt);
+	if (IS_ERR(attr_fid)) {
+		err = PTR_ERR(attr_fid);
+		attr_fid = NULL;
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P,
+		">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n",
+		file_fid->fid, attr_fid->fid, attr_name);
+
+	req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds",
+			file_fid->fid, attr_fid->fid, attr_name);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		p9_free_req(clnt, req);
+		goto clunk_fid;
+	}
+	p9_free_req(clnt, req);
+	P9_DPRINTK(P9_DEBUG_9P, "<<<  RXATTRWALK fid %d size %llu\n",
+		attr_fid->fid, *attr_size);
+	return attr_fid;
+clunk_fid:
+	p9_client_clunk(attr_fid);
+	attr_fid = NULL;
+error:
+	if (attr_fid && (attr_fid != file_fid))
+		p9_fid_destroy(attr_fid);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrwalk);
+
+int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
+			u64 attr_size, int flags)
+{
+	int err;
+	struct p9_req_t *req;
+	struct p9_client *clnt;
+
+	P9_DPRINTK(P9_DEBUG_9P,
+		">>> TXATTRCREATE fid %d name  %s size %lld flag %d\n",
+		fid->fid, name, (long long)attr_size, flags);
+	err = 0;
+	clnt = fid->clnt;
+	req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd",
+			fid->fid, name, attr_size, flags);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid);
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
+
+int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
+{
+	int err, rsize, total;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+	char *dataptr;
+
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n",
+				fid->fid, (long long unsigned) offset, count);
+
+	err = 0;
+	clnt = fid->clnt;
+	total = 0;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize-P9_READDIRHDRSZ)
+		rsize = clnt->msize - P9_READDIRHDRSZ;
+
+	if (count < rsize)
+		rsize = count;
+
+	req = p9_client_rpc(clnt, P9_TREADDIR, "dqd", fid->fid, offset, rsize);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto error;
+	}
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto free_and_error;
+	}
+
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
+
+	if (data)
+		memmove(data, dataptr, count);
+
+	p9_free_req(clnt, req);
+	return count;
+
+free_and_error:
+	p9_free_req(clnt, req);
+error:
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readdir);
+
+int p9_client_mknod_dotl(struct p9_fid *fid, char *name, int mode,
+			dev_t rdev, gid_t gid, struct p9_qid *qid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d "
+		"minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
+	req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddd", fid->fid, name, mode,
+		MAJOR(rdev), MINOR(rdev), gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type,
+				(unsigned long long)qid->path, qid->version);
+
+error:
+	p9_free_req(clnt, req);
+	return err;
+
+}
+EXPORT_SYMBOL(p9_client_mknod_dotl);
+
+int p9_client_mkdir_dotl(struct p9_fid *fid, char *name, int mode,
+				gid_t gid, struct p9_qid *qid)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n",
+		 fid->fid, name, mode, gid);
+	req = p9_client_rpc(clnt, P9_TMKDIR, "dsdd", fid->fid, name, mode,
+		gid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type,
+				(unsigned long long)qid->path, qid->version);
+
+error:
+	p9_free_req(clnt, req);
+	return err;
+
+}
+EXPORT_SYMBOL(p9_client_mkdir_dotl);
+
+int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d "
+			"start %lld length %lld proc_id %d client_id %s\n",
+			fid->fid, flock->type, flock->flags, flock->start,
+			flock->length, flock->proc_id, flock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
+				flock->flags, flock->start, flock->length,
+					flock->proc_id, flock->client_id);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "b", status);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
+error:
+	p9_free_req(clnt, req);
+	return err;
+
+}
+EXPORT_SYMBOL(p9_client_lock_dotl);
+
+int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld "
+		"length %lld proc_id %d client_id %s\n", fid->fid, glock->type,
+		glock->start, glock->length, glock->proc_id, glock->client_id);
+
+	req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid,  glock->type,
+		glock->start, glock->length, glock->proc_id, glock->client_id);
+
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type,
+			&glock->start, &glock->length, &glock->proc_id,
+			&glock->client_id);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld "
+		"proc_id %d client_id %s\n", glock->type, glock->start,
+		glock->length, glock->proc_id, glock->client_id);
+error:
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_getlock_dotl);
+
+int p9_client_readlink(struct p9_fid *fid, char **target)
+{
+	int err;
+	struct p9_client *clnt;
+	struct p9_req_t *req;
+
+	err = 0;
+	clnt = fid->clnt;
+	P9_DPRINTK(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
+
+	req = p9_client_rpc(clnt, P9_TREADLINK, "d", fid->fid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = p9pdu_readf(req->rc, clnt->proto_version, "s", target);
+	if (err) {
+		p9pdu_dump(1, req->rc);
+		goto error;
+	}
+	P9_DPRINTK(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
+error:
+	p9_free_req(clnt, req);
+	return err;
+}
+EXPORT_SYMBOL(p9_client_readlink);
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 149f8216013..45c15f49140 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -122,9 +122,8 @@ static size_t
 pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size)
 {
 	size_t len = MIN(pdu->capacity - pdu->size, size);
-	int err = copy_from_user(&pdu->sdata[pdu->size], udata, len);
-	if (err)
-		printk(KERN_WARNING "pdu_write_u returning: %d\n", err);
+	if (copy_from_user(&pdu->sdata[pdu->size], udata, len))
+		len = 0;
 
 	pdu->size += len;
 	return size - len;
@@ -141,6 +140,7 @@ pdu_write_u(struct p9_fcall *pdu, const char __user *udata, size_t size)
 	D - data blob (int32_t size followed by void *, results are not freed)
 	T - array of strings (int16_t count, followed by strings)
 	R - array of qids (int16_t count, followed by qids)
+	A - stat for 9p2000.L (p9_stat_dotl)
 	? - if optional = 1, continue parsing
 */
 
@@ -340,6 +340,33 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
 				}
 			}
 			break;
+		case 'A': {
+				struct p9_stat_dotl *stbuf =
+				    va_arg(ap, struct p9_stat_dotl *);
+
+				memset(stbuf, 0, sizeof(struct p9_stat_dotl));
+				errcode =
+				    p9pdu_readf(pdu, proto_version,
+					"qQdddqqqqqqqqqqqqqqq",
+					&stbuf->st_result_mask,
+					&stbuf->qid,
+					&stbuf->st_mode,
+					&stbuf->st_uid, &stbuf->st_gid,
+					&stbuf->st_nlink,
+					&stbuf->st_rdev, &stbuf->st_size,
+					&stbuf->st_blksize, &stbuf->st_blocks,
+					&stbuf->st_atime_sec,
+					&stbuf->st_atime_nsec,
+					&stbuf->st_mtime_sec,
+					&stbuf->st_mtime_nsec,
+					&stbuf->st_ctime_sec,
+					&stbuf->st_ctime_nsec,
+					&stbuf->st_btime_sec,
+					&stbuf->st_btime_nsec,
+					&stbuf->st_gen,
+					&stbuf->st_data_version);
+			}
+			break;
 		case '?':
 			if ((proto_version != p9_proto_2000u) &&
 				(proto_version != p9_proto_2000L))
@@ -488,6 +515,23 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 				}
 			}
 			break;
+		case 'I':{
+				struct p9_iattr_dotl *p9attr = va_arg(ap,
+							struct p9_iattr_dotl *);
+
+				errcode = p9pdu_writef(pdu, proto_version,
+							"ddddqqqqq",
+							p9attr->valid,
+							p9attr->mode,
+							p9attr->uid,
+							p9attr->gid,
+							p9attr->size,
+							p9attr->atime_sec,
+							p9attr->atime_nsec,
+							p9attr->mtime_sec,
+							p9attr->mtime_nsec);
+			}
+			break;
 		case '?':
 			if ((proto_version != p9_proto_2000u) &&
 				(proto_version != p9_proto_2000L))
@@ -580,3 +624,30 @@ void p9pdu_reset(struct p9_fcall *pdu)
 	pdu->offset = 0;
 	pdu->size = 0;
 }
+
+int p9dirent_read(char *buf, int len, struct p9_dirent *dirent,
+						int proto_version)
+{
+	struct p9_fcall fake_pdu;
+	int ret;
+	char *nameptr;
+
+	fake_pdu.size = len;
+	fake_pdu.capacity = len;
+	fake_pdu.sdata = buf;
+	fake_pdu.offset = 0;
+
+	ret = p9pdu_readf(&fake_pdu, proto_version, "Qqbs", &dirent->qid,
+			&dirent->d_off, &dirent->d_type, &nameptr);
+	if (ret) {
+		P9_DPRINTK(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
+		p9pdu_dump(1, &fake_pdu);
+		goto out;
+	}
+
+	strcpy(dirent->d_name, nameptr);
+
+out:
+	return fake_pdu.offset;
+}
+EXPORT_SYMBOL(p9dirent_read);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 98ce9bcb0e1..078eb162d9b 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -222,7 +222,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
 	}
 }
 
-static unsigned int
+static int
 p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
 {
 	int ret, n;
@@ -948,7 +948,7 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
 
 	csocket = NULL;
 
-	if (strlen(addr) > UNIX_PATH_MAX) {
+	if (strlen(addr) >= UNIX_PATH_MAX) {
 		P9_EPRINTK(KERN_ERR, "p9_trans_unix: address too long: %s\n",
 			addr);
 		return -ENAMETOOLONG;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 0ea20c30466..17c5ba7551a 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -426,8 +426,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 
 	/* Allocate an fcall for the reply */
 	rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL);
-	if (!rpl_context)
+	if (!rpl_context) {
+		err = -ENOMEM;
 		goto err_close;
+	}
 
 	/*
 	 * If the request has a buffer, steal it, otherwise
@@ -445,8 +447,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 	}
 	rpl_context->rc = req->rc;
 	if (!rpl_context->rc) {
-		kfree(rpl_context);
-		goto err_close;
+		err = -ENOMEM;
+		goto err_free2;
 	}
 
 	/*
@@ -458,11 +460,8 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 	 */
 	if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) {
 		err = post_recv(client, rpl_context);
-		if (err) {
-			kfree(rpl_context->rc);
-			kfree(rpl_context);
-			goto err_close;
-		}
+		if (err)
+			goto err_free1;
 	} else
 		atomic_dec(&rdma->rq_count);
 
@@ -471,8 +470,10 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 
 	/* Post the request */
 	c = kmalloc(sizeof *c, GFP_KERNEL);
-	if (!c)
-		goto err_close;
+	if (!c) {
+		err = -ENOMEM;
+		goto err_free1;
+	}
 	c->req = req;
 
 	c->busa = ib_dma_map_single(rdma->cm_id->device,
@@ -499,9 +500,15 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 	return ib_post_send(rdma->qp, &wr, &bad_wr);
 
  error:
+	kfree(c);
+	kfree(rpl_context->rc);
+	kfree(rpl_context);
 	P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n");
 	return -EIO;
-
+ err_free1:
+	kfree(rpl_context->rc);
+ err_free2:
+	kfree(rpl_context);
  err_close:
 	spin_lock_irqsave(&rdma->req_lock, flags);
 	if (rdma->state < P9_RDMA_CLOSING) {
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index dcfbe99ff81..c8f3f72ab20 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -75,6 +75,8 @@ struct virtio_chan {
 	struct p9_client *client;
 	struct virtio_device *vdev;
 	struct virtqueue *vq;
+	int ring_bufs_avail;
+	wait_queue_head_t *vc_wq;
 
 	/* Scatterlist: can be too big for stack. */
 	struct scatterlist sg[VIRTQUEUE_NUM];
@@ -134,16 +136,30 @@ static void req_done(struct virtqueue *vq)
 	struct p9_fcall *rc;
 	unsigned int len;
 	struct p9_req_t *req;
+	unsigned long flags;
 
 	P9_DPRINTK(P9_DEBUG_TRANS, ": request done\n");
 
-	while ((rc = virtqueue_get_buf(chan->vq, &len)) != NULL) {
-		P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc);
-		P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag);
-		req = p9_tag_lookup(chan->client, rc->tag);
-		req->status = REQ_STATUS_RCVD;
-		p9_client_cb(chan->client, req);
-	}
+	do {
+		spin_lock_irqsave(&chan->lock, flags);
+		rc = virtqueue_get_buf(chan->vq, &len);
+
+		if (rc != NULL) {
+			if (!chan->ring_bufs_avail) {
+				chan->ring_bufs_avail = 1;
+				wake_up(chan->vc_wq);
+			}
+			spin_unlock_irqrestore(&chan->lock, flags);
+			P9_DPRINTK(P9_DEBUG_TRANS, ": rc %p\n", rc);
+			P9_DPRINTK(P9_DEBUG_TRANS, ": lookup tag %d\n",
+					rc->tag);
+			req = p9_tag_lookup(chan->client, rc->tag);
+			req->status = REQ_STATUS_RCVD;
+			p9_client_cb(chan->client, req);
+		} else {
+			spin_unlock_irqrestore(&chan->lock, flags);
+		}
+	} while (rc != NULL);
 }
 
 /**
@@ -199,23 +215,43 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
 	int in, out;
 	struct virtio_chan *chan = client->trans;
 	char *rdata = (char *)req->rc+sizeof(struct p9_fcall);
+	unsigned long flags;
+	int err;
 
 	P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request\n");
 
+req_retry:
+	req->status = REQ_STATUS_SENT;
+
+	spin_lock_irqsave(&chan->lock, flags);
 	out = pack_sg_list(chan->sg, 0, VIRTQUEUE_NUM, req->tc->sdata,
 								req->tc->size);
 	in = pack_sg_list(chan->sg, out, VIRTQUEUE_NUM-out, rdata,
 								client->msize);
 
-	req->status = REQ_STATUS_SENT;
-
-	if (virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc) < 0) {
-		P9_DPRINTK(P9_DEBUG_TRANS,
-			"9p debug: virtio rpc add_buf returned failure");
-		return -EIO;
+	err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc);
+	if (err < 0) {
+		if (err == -ENOSPC) {
+			chan->ring_bufs_avail = 0;
+			spin_unlock_irqrestore(&chan->lock, flags);
+			err = wait_event_interruptible(*chan->vc_wq,
+							chan->ring_bufs_avail);
+			if (err  == -ERESTARTSYS)
+				return err;
+
+			P9_DPRINTK(P9_DEBUG_TRANS, "9p:Retry virtio request\n");
+			goto req_retry;
+		} else {
+			spin_unlock_irqrestore(&chan->lock, flags);
+			P9_DPRINTK(P9_DEBUG_TRANS,
+					"9p debug: "
+					"virtio rpc add_buf returned failure");
+			return -EIO;
+		}
 	}
 
 	virtqueue_kick(chan->vq);
+	spin_unlock_irqrestore(&chan->lock, flags);
 
 	P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio request kicked\n");
 	return 0;
@@ -290,14 +326,23 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 	chan->tag_len = tag_len;
 	err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
 	if (err) {
-		kfree(tag);
-		goto out_free_vq;
+		goto out_free_tag;
 	}
+	chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!chan->vc_wq) {
+		err = -ENOMEM;
+		goto out_free_tag;
+	}
+	init_waitqueue_head(chan->vc_wq);
+	chan->ring_bufs_avail = 1;
+
 	mutex_lock(&virtio_9p_lock);
 	list_add_tail(&chan->chan_list, &virtio_chan_list);
 	mutex_unlock(&virtio_9p_lock);
 	return 0;
 
+out_free_tag:
+	kfree(tag);
 out_free_vq:
 	vdev->config->del_vqs(vdev);
 	kfree(chan);
@@ -329,7 +374,8 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
 
 	mutex_lock(&virtio_9p_lock);
 	list_for_each_entry(chan, &virtio_chan_list, chan_list) {
-		if (!strncmp(devname, chan->tag, chan->tag_len)) {
+		if (!strncmp(devname, chan->tag, chan->tag_len) &&
+		    strlen(devname) == chan->tag_len) {
 			if (!chan->inuse) {
 				chan->inuse = true;
 				found = 1;
@@ -370,6 +416,7 @@ static void p9_virtio_remove(struct virtio_device *vdev)
 	mutex_unlock(&virtio_9p_lock);
 	sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
 	kfree(chan->tag);
+	kfree(chan->vc_wq);
 	kfree(chan);
 
 }
diff --git a/net/Kconfig b/net/Kconfig
index 0d68b40fc0e..55fd82e9ffd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -32,7 +32,7 @@ config WANT_COMPAT_NETLINK_MESSAGES
 config COMPAT_NETLINK_MESSAGES
 	def_bool y
 	depends on COMPAT
-	depends on WIRELESS_EXT || WANT_COMPAT_NETLINK_MESSAGES
+	depends on WEXT_CORE || WANT_COMPAT_NETLINK_MESSAGES
 	help
 	  This option makes it possible to send different netlink messages
 	  to tasks depending on whether the task is a compat task or not. To
@@ -86,6 +86,16 @@ config NETWORK_SECMARK
 	  to nfmark, but designated for security purposes.
 	  If you are unsure how to answer this question, answer N.
 
+config NETWORK_PHY_TIMESTAMPING
+	bool "Timestamping in PHY devices"
+	depends on EXPERIMENTAL
+	help
+	  This allows timestamping of network packets by PHYs with
+	  hardware timestamping capabilities. This option adds some
+	  overhead in the transmit and receive paths.
+
+	  If you are unsure how to answer this question, answer N.
+
 menuconfig NETFILTER
 	bool "Network packet filtering framework (Netfilter)"
 	---help---
@@ -203,10 +213,11 @@ source "net/phonet/Kconfig"
 source "net/ieee802154/Kconfig"
 source "net/sched/Kconfig"
 source "net/dcb/Kconfig"
+source "net/dns_resolver/Kconfig"
 
 config RPS
 	boolean
-	depends on SMP && SYSFS
+	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
 	default y
 
 menu "Network testing"
@@ -282,6 +293,7 @@ source "net/wimax/Kconfig"
 source "net/rfkill/Kconfig"
 source "net/9p/Kconfig"
 source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
 
 
 endif   # if NET
diff --git a/net/Makefile b/net/Makefile
index cb7bdc1210c..6b7bfd7f141 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -50,7 +50,7 @@ endif
 obj-$(CONFIG_IP_DCCP)		+= dccp/
 obj-$(CONFIG_IP_SCTP)		+= sctp/
 obj-$(CONFIG_RDS)		+= rds/
-obj-y				+= wireless/
+obj-$(CONFIG_WIRELESS)		+= wireless/
 obj-$(CONFIG_MAC80211)		+= mac80211/
 obj-$(CONFIG_TIPC)		+= tipc/
 obj-$(CONFIG_NETLABEL)		+= netlabel/
@@ -61,9 +61,11 @@ obj-$(CONFIG_CAIF)		+= caif/
 ifneq ($(CONFIG_DCB),)
 obj-y				+= dcb/
 endif
-obj-y				+= ieee802154/
+obj-$(CONFIG_IEEE802154)	+= ieee802154/
 
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)		+= wimax/
+obj-$(CONFIG_DNS_RESOLVER)	+= dns_resolver/
+obj-$(CONFIG_CEPH_LIB)		+= ceph/
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 6719af6a59f..ad2b232a205 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -139,6 +139,43 @@ static struct net_device *br2684_find_dev(const struct br2684_if_spec *s)
 	return NULL;
 }
 
+static int atm_dev_event(struct notifier_block *this, unsigned long event,
+		 void *arg)
+{
+	struct atm_dev *atm_dev = arg;
+	struct list_head *lh;
+	struct net_device *net_dev;
+	struct br2684_vcc *brvcc;
+	struct atm_vcc *atm_vcc;
+	unsigned long flags;
+
+	pr_debug("event=%ld dev=%p\n", event, atm_dev);
+
+	read_lock_irqsave(&devs_lock, flags);
+	list_for_each(lh, &br2684_devs) {
+		net_dev = list_entry_brdev(lh);
+
+		list_for_each_entry(brvcc, &BRPRIV(net_dev)->brvccs, brvccs) {
+			atm_vcc = brvcc->atmvcc;
+			if (atm_vcc && brvcc->atmvcc->dev == atm_dev) {
+
+				if (atm_vcc->dev->signal == ATM_PHY_SIG_LOST)
+					netif_carrier_off(net_dev);
+				else
+					netif_carrier_on(net_dev);
+
+			}
+		}
+	}
+	read_unlock_irqrestore(&devs_lock, flags);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block atm_dev_notifier = {
+	.notifier_call = atm_dev_event,
+};
+
 /* chained vcc->pop function.  Check if we should wake the netif_queue */
 static void br2684_pop(struct atm_vcc *vcc, struct sk_buff *skb)
 {
@@ -530,6 +567,13 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
 
 		br2684_push(atmvcc, skb);
 	}
+
+	/* initialize netdev carrier state */
+	if (atmvcc->dev->signal == ATM_PHY_SIG_LOST)
+		netif_carrier_off(net_dev);
+	else
+		netif_carrier_on(net_dev);
+
 	__module_get(THIS_MODULE);
 	return 0;
 
@@ -620,9 +664,15 @@ static int br2684_create(void __user *arg)
 	}
 
 	write_lock_irq(&devs_lock);
+
 	brdev->payload = payload;
-	brdev->number = list_empty(&br2684_devs) ? 1 :
-	    BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
+
+	if (list_empty(&br2684_devs)) {
+		/* 1st br2684 device */
+		brdev->number = 1;
+	} else
+		brdev->number = BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
+
 	list_add_tail(&brdev->br2684_devs, &br2684_devs);
 	write_unlock_irq(&devs_lock);
 	return 0;
@@ -758,6 +808,7 @@ static int __init br2684_init(void)
 		return -ENOMEM;
 #endif
 	register_atm_ioctl(&br2684_ioctl_ops);
+	register_atmdevice_notifier(&atm_dev_notifier);
 	return 0;
 }
 
@@ -772,6 +823,9 @@ static void __exit br2684_exit(void)
 	remove_proc_entry("br2684", atm_proc_root);
 #endif
 
+
+	unregister_atmdevice_notifier(&atm_dev_notifier);
+
 	while (!list_empty(&br2684_devs)) {
 		net_dev = list_entry_brdev(br2684_devs.next);
 		brdev = BRPRIV(net_dev);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 313aba11316..ff956d1115b 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -310,9 +310,9 @@ static int clip_constructor(struct neighbour *neigh)
 	return 0;
 }
 
-static u32 clip_hash(const void *pkey, const struct net_device *dev)
+static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
 {
-	return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd);
+	return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd);
 }
 
 static struct neigh_table clip_tbl = {
@@ -522,7 +522,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
 	error = ip_route_output_key(&init_net, &rt, &fl);
 	if (error)
 		return error;
-	neigh = __neigh_lookup(&clip_tbl, &ip, rt->u.dst.dev, 1);
+	neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1);
 	ip_rt_put(rt);
 	if (!neigh)
 		return -ENOMEM;
diff --git a/net/atm/common.c b/net/atm/common.c
index b43feb1a399..1b9c52a02cd 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -37,6 +37,8 @@ EXPORT_SYMBOL(vcc_hash);
 DEFINE_RWLOCK(vcc_sklist_lock);
 EXPORT_SYMBOL(vcc_sklist_lock);
 
+static ATOMIC_NOTIFIER_HEAD(atm_dev_notify_chain);
+
 static void __vcc_insert_socket(struct sock *sk)
 {
 	struct atm_vcc *vcc = atm_sk(sk);
@@ -212,6 +214,22 @@ void vcc_release_async(struct atm_vcc *vcc, int reply)
 }
 EXPORT_SYMBOL(vcc_release_async);
 
+void atm_dev_signal_change(struct atm_dev *dev, char signal)
+{
+	pr_debug("%s signal=%d dev=%p number=%d dev->signal=%d\n",
+		__func__, signal, dev, dev->number, dev->signal);
+
+	/* atm driver sending invalid signal */
+	WARN_ON(signal < ATM_PHY_SIG_LOST || signal > ATM_PHY_SIG_FOUND);
+
+	if (dev->signal == signal)
+		return; /* no change */
+
+	dev->signal = signal;
+
+	atomic_notifier_call_chain(&atm_dev_notify_chain, signal, dev);
+}
+EXPORT_SYMBOL(atm_dev_signal_change);
 
 void atm_dev_release_vccs(struct atm_dev *dev)
 {
@@ -774,13 +792,25 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
 	default:
 		if (level == SOL_SOCKET)
 			return -EINVAL;
-			break;
+		break;
 	}
 	if (!vcc->dev || !vcc->dev->ops->getsockopt)
 		return -EINVAL;
 	return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len);
 }
 
+int register_atmdevice_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_atmdevice_notifier);
+
+void unregister_atmdevice_notifier(struct notifier_block *nb)
+{
+	atomic_notifier_chain_unregister(&atm_dev_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_atmdevice_notifier);
+
 static int __init atm_init(void)
 {
 	int error;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d98bde1a0ac..181d70c73d7 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -220,7 +220,6 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
 static int lec_open(struct net_device *dev)
 {
 	netif_start_queue(dev);
-	memset(&dev->stats, 0, sizeof(struct net_device_stats));
 
 	return 0;
 }
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 622b471e14e..74bcc662c3d 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
 	eg->packets_rcvd++;
 	mpc->eg_ops->put(eg);
 
-	memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+	memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
 	netif_rx(new_skb);
 }
 
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index e49bb6d948a..e9aced0ec56 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -260,7 +260,7 @@ static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd,
 	return -ENOTTY;
 }
 
-static /*const*/ struct ppp_channel_ops pppoatm_ops = {
+static const struct ppp_channel_ops pppoatm_ops = {
 	.start_xmit = pppoatm_send,
 	.ioctl = pppoatm_devppp_ioctl,
 };
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 6262aeae398..f85da0779e5 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -38,6 +38,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
 static const struct file_operations proc_atm_dev_ops = {
 	.owner =	THIS_MODULE,
 	.read =		proc_dev_atm_read,
+	.llseek =	noop_llseek,
 };
 
 static void add_stats(struct seq_file *seq, const char *aal,
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
index 2a72aa96a56..705e53ef4af 100644
--- a/net/ax25/Kconfig
+++ b/net/ax25/Kconfig
@@ -7,7 +7,7 @@ menuconfig HAMRADIO
 	bool "Amateur Radio support"
 	help
 	  If you want to connect your Linux box to an amateur radio, answer Y
-	  here. You want to read <http://www.tapr.org/tapr/html/pkthome.html>
+	  here. You want to read <http://www.tapr.org/>
 	  and more specifically about AX.25 on Linux
 	  <http://www.linux-ax25.org/>.
 
@@ -42,7 +42,7 @@ config AX25
 	  check out the file <file:Documentation/networking/ax25.txt> in the
 	  kernel source. More information about digital amateur radio in
 	  general is on the WWW at
-	  <http://www.tapr.org/tapr/html/pkthome.html>.
+	  <http://www.tapr.org/>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called ax25.
@@ -89,7 +89,7 @@ config NETROM
 	  <http://www.linux-ax25.org>. You also might want to check out the
 	  file <file:Documentation/networking/ax25.txt>. More information about
 	  digital amateur radio in general is on the WWW at
-	  <http://www.tapr.org/tapr/html/pkthome.html>.
+	  <http://www.tapr.org/>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called netrom.
@@ -108,7 +108,7 @@ config ROSE
 	  <http://www.linux-ax25.org>.  You also might want to check out the
 	  file <file:Documentation/networking/ax25.txt>. More information about
 	  digital amateur radio in general is on the WWW at
-	  <http://www.tapr.org/tapr/html/pkthome.html>.
+	  <http://www.tapr.org/>.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called rose.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index cfdfd7e2a17..26eaebf4aaa 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1103,7 +1103,7 @@ done:
 out:
 	release_sock(sk);
 
-	return 0;
+	return err;
 }
 
 /*
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index 2ce79df0068..c7d81436213 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -112,8 +112,8 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25)
 			if (sk) {
 				sock_hold(sk);
 				ax25_destroy_socket(ax25);
-				sock_put(sk);
 				bh_unlock_sock(sk);
+				sock_put(sk);
 			} else
 				ax25_destroy_socket(ax25);
 			return;
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 7805945a5fd..a1690845dc6 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -412,7 +412,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 {
 	ax25_uid_assoc *user;
 	ax25_route *ax25_rt;
-	int err;
+	int err = 0;
 
 	if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
 		return -EHOSTUNREACH;
@@ -453,7 +453,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 put:
 	ax25_put_route(ax25_rt);
 
-	return 0;
+	return err;
 }
 
 struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index ee3b3049d38..ed371684c13 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -43,19 +43,6 @@ config BT_L2CAP
 	  Say Y here to compile L2CAP support into the kernel or say M to
 	  compile it as module (l2cap).
 
-config BT_L2CAP_EXT_FEATURES
-	bool "L2CAP Extended Features support (EXPERIMENTAL)"
-	depends on BT_L2CAP && EXPERIMENTAL
-	help
-	  This option enables the L2CAP Extended Features support. These
-	  new features include the Enhanced Retransmission and Streaming
-	  Modes, the Frame Check Sequence (FCS), and Segmentation and
-	  Reassembly (SAR) for L2CAP packets. They are a required for the
-	  new Alternate MAC/PHY and the Bluetooth Medical Profile.
-
-	  You should say N unless you know what you are doing. Note that
-	  this is in an experimental state yet.
-
 config BT_SCO
 	tristate "SCO links support"
 	depends on BT
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 421c45bd1b9..c4cf3f59500 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -265,6 +265,115 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 }
 EXPORT_SYMBOL(bt_sock_recvmsg);
 
+static long bt_sock_data_wait(struct sock *sk, long timeo)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (!skb_queue_empty(&sk->sk_receive_queue))
+			break;
+
+		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
+			break;
+
+		if (signal_pending(current) || !timeo)
+			break;
+
+		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+	}
+
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return timeo;
+}
+
+int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
+			       struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	size_t target, copied = 0;
+	long timeo;
+
+	if (flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	msg->msg_namelen = 0;
+
+	BT_DBG("sk %p size %zu", sk, size);
+
+	lock_sock(sk);
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
+	timeo  = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	do {
+		struct sk_buff *skb;
+		int chunk;
+
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		if (!skb) {
+			if (copied >= target)
+				break;
+
+			if ((err = sock_error(sk)) != 0)
+				break;
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			err = -EAGAIN;
+			if (!timeo)
+				break;
+
+			timeo = bt_sock_data_wait(sk, timeo);
+
+			if (signal_pending(current)) {
+				err = sock_intr_errno(timeo);
+				goto out;
+			}
+			continue;
+		}
+
+		chunk = min_t(unsigned int, skb->len, size);
+		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			if (!copied)
+				copied = -EFAULT;
+			break;
+		}
+		copied += chunk;
+		size   -= chunk;
+
+		sock_recv_ts_and_drops(msg, sk, skb);
+
+		if (!(flags & MSG_PEEK)) {
+			skb_pull(skb, chunk);
+			if (skb->len) {
+				skb_queue_head(&sk->sk_receive_queue, skb);
+				break;
+			}
+			kfree_skb(skb);
+
+		} else {
+			/* put message back and return */
+			skb_queue_head(&sk->sk_receive_queue, skb);
+			break;
+		}
+	} while (size);
+
+out:
+	release_sock(sk);
+	return copied ? : err;
+}
+EXPORT_SYMBOL(bt_sock_stream_recvmsg);
+
 static inline unsigned int bt_accept_poll(struct sock *parent)
 {
 	struct list_head *p, *n;
@@ -297,13 +406,12 @@ unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *w
 		mask |= POLLERR;
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-			(sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	if (sk->sk_state == BT_CLOSED)
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
index 0d9e506f5d5..70672544db8 100644
--- a/net/bluetooth/bnep/bnep.h
+++ b/net/bluetooth/bnep/bnep.h
@@ -86,26 +86,26 @@ struct bnep_setup_conn_req {
 	__u8  ctrl;
 	__u8  uuid_size;
 	__u8  service[0];
-} __attribute__((packed));
+} __packed;
 
 struct bnep_set_filter_req {
 	__u8  type;
 	__u8  ctrl;
 	__be16 len;
 	__u8  list[0];
-} __attribute__((packed));
+} __packed;
 
 struct bnep_control_rsp {
 	__u8  type;
 	__u8  ctrl;
 	__be16 resp;
-} __attribute__((packed));
+} __packed;
 
 struct bnep_ext_hdr {
 	__u8  type;
 	__u8  len;
 	__u8  data[0];
-} __attribute__((packed));
+} __packed;
 
 /* BNEP ioctl defines */
 #define BNEPCONNADD	_IOW('B', 200, int)
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index d4c6af082d4..ec0a1347f93 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -321,14 +321,10 @@ static int cmtp_session(void *arg)
 int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
 {
 	struct cmtp_session *session, *s;
-	bdaddr_t src, dst;
 	int i, err;
 
 	BT_DBG("");
 
-	baswap(&src, &bt_sk(sock->sk)->src);
-	baswap(&dst, &bt_sk(sock->sk)->dst);
-
 	session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
@@ -347,7 +343,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
 
 	BT_DBG("mtu %d", session->mtu);
 
-	sprintf(session->name, "%s", batostr(&dst));
+	sprintf(session->name, "%s", batostr(&bt_sk(sock->sk)->dst));
 
 	session->sock  = sock;
 	session->state = BT_CONFIG;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 800b6b9fbba..0b1e460fe44 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1,6 +1,6 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
-   Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -155,6 +155,27 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle)
 	hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp);
 }
 
+/* Device _must_ be locked */
+void hci_sco_setup(struct hci_conn *conn, __u8 status)
+{
+	struct hci_conn *sco = conn->link;
+
+	BT_DBG("%p", conn);
+
+	if (!sco)
+		return;
+
+	if (!status) {
+		if (lmp_esco_capable(conn->hdev))
+			hci_setup_sync(sco, conn->handle);
+		else
+			hci_add_sco(sco, conn->handle);
+	} else {
+		hci_proto_connect_cfm(sco, status);
+		hci_conn_del(sco);
+	}
+}
+
 static void hci_conn_timeout(unsigned long arg)
 {
 	struct hci_conn *conn = (void *) arg;
@@ -385,10 +406,13 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8
 		acl->power_save = 1;
 		hci_conn_enter_active_mode(acl);
 
-		if (lmp_esco_capable(hdev))
-			hci_setup_sync(sco, acl->handle);
-		else
-			hci_add_sco(sco, acl->handle);
+		if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->pend)) {
+			/* defer SCO setup until mode change completed */
+			set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->pend);
+			return sco;
+		}
+
+		hci_sco_setup(acl, 0x00);
 	}
 
 	return sco;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2f768de8701..bc2a052e518 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -913,7 +913,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	skb_queue_head_init(&hdev->cmd_q);
 	skb_queue_head_init(&hdev->raw_q);
 
-	for (i = 0; i < 3; i++)
+	for (i = 0; i < NUM_REASSEMBLY; i++)
 		hdev->reassembly[i] = NULL;
 
 	init_waitqueue_head(&hdev->req_wait_q);
@@ -923,6 +923,8 @@ int hci_register_dev(struct hci_dev *hdev)
 
 	hci_conn_hash_init(hdev);
 
+	INIT_LIST_HEAD(&hdev->blacklist);
+
 	memset(&hdev->stat, 0, sizeof(struct hci_dev_stats));
 
 	atomic_set(&hdev->promisc, 0);
@@ -970,7 +972,7 @@ int hci_unregister_dev(struct hci_dev *hdev)
 
 	hci_dev_do_close(hdev);
 
-	for (i = 0; i < 3; i++)
+	for (i = 0; i < NUM_REASSEMBLY; i++)
 		kfree_skb(hdev->reassembly[i]);
 
 	hci_notify(hdev, HCI_DEV_UNREG);
@@ -1030,89 +1032,170 @@ int hci_recv_frame(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(hci_recv_frame);
 
-/* Receive packet type fragment */
-#define __reassembly(hdev, type)  ((hdev)->reassembly[(type) - 2])
-
-int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count)
+static int hci_reassembly(struct hci_dev *hdev, int type, void *data,
+			  int count, __u8 index, gfp_t gfp_mask)
 {
-	if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT)
+	int len = 0;
+	int hlen = 0;
+	int remain = count;
+	struct sk_buff *skb;
+	struct bt_skb_cb *scb;
+
+	if ((type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT) ||
+				index >= NUM_REASSEMBLY)
 		return -EILSEQ;
 
+	skb = hdev->reassembly[index];
+
+	if (!skb) {
+		switch (type) {
+		case HCI_ACLDATA_PKT:
+			len = HCI_MAX_FRAME_SIZE;
+			hlen = HCI_ACL_HDR_SIZE;
+			break;
+		case HCI_EVENT_PKT:
+			len = HCI_MAX_EVENT_SIZE;
+			hlen = HCI_EVENT_HDR_SIZE;
+			break;
+		case HCI_SCODATA_PKT:
+			len = HCI_MAX_SCO_SIZE;
+			hlen = HCI_SCO_HDR_SIZE;
+			break;
+		}
+
+		skb = bt_skb_alloc(len, gfp_mask);
+		if (!skb)
+			return -ENOMEM;
+
+		scb = (void *) skb->cb;
+		scb->expect = hlen;
+		scb->pkt_type = type;
+
+		skb->dev = (void *) hdev;
+		hdev->reassembly[index] = skb;
+	}
+
 	while (count) {
-		struct sk_buff *skb = __reassembly(hdev, type);
-		struct { int expect; } *scb;
-		int len = 0;
+		scb = (void *) skb->cb;
+		len = min(scb->expect, (__u16)count);
 
-		if (!skb) {
-			/* Start of the frame */
+		memcpy(skb_put(skb, len), data, len);
 
-			switch (type) {
-			case HCI_EVENT_PKT:
-				if (count >= HCI_EVENT_HDR_SIZE) {
-					struct hci_event_hdr *h = data;
-					len = HCI_EVENT_HDR_SIZE + h->plen;
-				} else
-					return -EILSEQ;
-				break;
+		count -= len;
+		data += len;
+		scb->expect -= len;
+		remain = count;
 
-			case HCI_ACLDATA_PKT:
-				if (count >= HCI_ACL_HDR_SIZE) {
-					struct hci_acl_hdr *h = data;
-					len = HCI_ACL_HDR_SIZE + __le16_to_cpu(h->dlen);
-				} else
-					return -EILSEQ;
-				break;
+		switch (type) {
+		case HCI_EVENT_PKT:
+			if (skb->len == HCI_EVENT_HDR_SIZE) {
+				struct hci_event_hdr *h = hci_event_hdr(skb);
+				scb->expect = h->plen;
+
+				if (skb_tailroom(skb) < scb->expect) {
+					kfree_skb(skb);
+					hdev->reassembly[index] = NULL;
+					return -ENOMEM;
+				}
+			}
+			break;
 
-			case HCI_SCODATA_PKT:
-				if (count >= HCI_SCO_HDR_SIZE) {
-					struct hci_sco_hdr *h = data;
-					len = HCI_SCO_HDR_SIZE + h->dlen;
-				} else
-					return -EILSEQ;
-				break;
+		case HCI_ACLDATA_PKT:
+			if (skb->len  == HCI_ACL_HDR_SIZE) {
+				struct hci_acl_hdr *h = hci_acl_hdr(skb);
+				scb->expect = __le16_to_cpu(h->dlen);
+
+				if (skb_tailroom(skb) < scb->expect) {
+					kfree_skb(skb);
+					hdev->reassembly[index] = NULL;
+					return -ENOMEM;
+				}
 			}
+			break;
 
-			skb = bt_skb_alloc(len, GFP_ATOMIC);
-			if (!skb) {
-				BT_ERR("%s no memory for packet", hdev->name);
-				return -ENOMEM;
+		case HCI_SCODATA_PKT:
+			if (skb->len == HCI_SCO_HDR_SIZE) {
+				struct hci_sco_hdr *h = hci_sco_hdr(skb);
+				scb->expect = h->dlen;
+
+				if (skb_tailroom(skb) < scb->expect) {
+					kfree_skb(skb);
+					hdev->reassembly[index] = NULL;
+					return -ENOMEM;
+				}
 			}
+			break;
+		}
+
+		if (scb->expect == 0) {
+			/* Complete frame */
 
-			skb->dev = (void *) hdev;
 			bt_cb(skb)->pkt_type = type;
+			hci_recv_frame(skb);
 
-			__reassembly(hdev, type) = skb;
+			hdev->reassembly[index] = NULL;
+			return remain;
+		}
+	}
 
-			scb = (void *) skb->cb;
-			scb->expect = len;
-		} else {
-			/* Continuation */
+	return remain;
+}
 
-			scb = (void *) skb->cb;
-			len = scb->expect;
-		}
+int hci_recv_fragment(struct hci_dev *hdev, int type, void *data, int count)
+{
+	int rem = 0;
 
-		len = min(len, count);
+	if (type < HCI_ACLDATA_PKT || type > HCI_EVENT_PKT)
+		return -EILSEQ;
 
-		memcpy(skb_put(skb, len), data, len);
+	while (count) {
+		rem = hci_reassembly(hdev, type, data, count,
+						type - 1, GFP_ATOMIC);
+		if (rem < 0)
+			return rem;
 
-		scb->expect -= len;
+		data += (count - rem);
+		count = rem;
+	};
 
-		if (scb->expect == 0) {
-			/* Complete frame */
+	return rem;
+}
+EXPORT_SYMBOL(hci_recv_fragment);
 
-			__reassembly(hdev, type) = NULL;
+#define STREAM_REASSEMBLY 0
 
-			bt_cb(skb)->pkt_type = type;
-			hci_recv_frame(skb);
-		}
+int hci_recv_stream_fragment(struct hci_dev *hdev, void *data, int count)
+{
+	int type;
+	int rem = 0;
 
-		count -= len; data += len;
-	}
+	while (count) {
+		struct sk_buff *skb = hdev->reassembly[STREAM_REASSEMBLY];
 
-	return 0;
+		if (!skb) {
+			struct { char type; } *pkt;
+
+			/* Start of the frame */
+			pkt = data;
+			type = pkt->type;
+
+			data++;
+			count--;
+		} else
+			type = bt_cb(skb)->pkt_type;
+
+		rem = hci_reassembly(hdev, type, data,
+					count, STREAM_REASSEMBLY, GFP_ATOMIC);
+		if (rem < 0)
+			return rem;
+
+		data += (count - rem);
+		count = rem;
+	};
+
+	return rem;
 }
-EXPORT_SYMBOL(hci_recv_fragment);
+EXPORT_SYMBOL(hci_recv_stream_fragment);
 
 /* ---- Interface to upper protocols ---- */
 
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 786b5de0bac..bfef5bae0b3 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1,6 +1,6 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
-   Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -584,7 +584,7 @@ static inline void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 				conn->out = 1;
 				conn->link_mode |= HCI_LM_MASTER;
 			} else
-				BT_ERR("No memmory for new connection");
+				BT_ERR("No memory for new connection");
 		}
 	}
 
@@ -785,9 +785,13 @@ static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
 	hci_dev_lock(hdev);
 
 	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
-	if (conn)
+	if (conn) {
 		clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend);
 
+		if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend))
+			hci_sco_setup(conn, status);
+	}
+
 	hci_dev_unlock(hdev);
 }
 
@@ -808,9 +812,13 @@ static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status)
 	hci_dev_lock(hdev);
 
 	conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
-	if (conn)
+	if (conn) {
 		clear_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->pend);
 
+		if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend))
+			hci_sco_setup(conn, status);
+	}
+
 	hci_dev_unlock(hdev);
 }
 
@@ -915,20 +923,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s
 	} else
 		conn->state = BT_CLOSED;
 
-	if (conn->type == ACL_LINK) {
-		struct hci_conn *sco = conn->link;
-		if (sco) {
-			if (!ev->status) {
-				if (lmp_esco_capable(hdev))
-					hci_setup_sync(sco, conn->handle);
-				else
-					hci_add_sco(sco, conn->handle);
-			} else {
-				hci_proto_connect_cfm(sco, ev->status);
-				hci_conn_del(sco);
-			}
-		}
-	}
+	if (conn->type == ACL_LINK)
+		hci_sco_setup(conn, ev->status);
 
 	if (ev->status) {
 		hci_proto_connect_cfm(conn, ev->status);
@@ -952,7 +948,7 @@ static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *sk
 
 	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type);
 
-	if (mask & HCI_LM_ACCEPT) {
+	if ((mask & HCI_LM_ACCEPT) && !hci_blacklist_lookup(hdev, &ev->bdaddr)) {
 		/* Connection accepted */
 		struct inquiry_entry *ie;
 		struct hci_conn *conn;
@@ -965,7 +961,7 @@ static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *sk
 		conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
 		if (!conn) {
 			if (!(conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr))) {
-				BT_ERR("No memmory for new connection");
+				BT_ERR("No memory for new connection");
 				hci_dev_unlock(hdev);
 				return;
 			}
@@ -1481,6 +1477,9 @@ static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb
 			else
 				conn->power_save = 0;
 		}
+
+		if (test_and_clear_bit(HCI_CONN_SCO_SETUP_PEND, &conn->pend))
+			hci_sco_setup(conn, ev->status);
 	}
 
 	hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 38f08f6b86f..83acd164d39 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -165,6 +165,84 @@ static int hci_sock_release(struct socket *sock)
 	return 0;
 }
 
+struct bdaddr_list *hci_blacklist_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr)
+{
+	struct list_head *p;
+
+	list_for_each(p, &hdev->blacklist) {
+		struct bdaddr_list *b;
+
+		b = list_entry(p, struct bdaddr_list, list);
+
+		if (bacmp(bdaddr, &b->bdaddr) == 0)
+			return b;
+	}
+
+	return NULL;
+}
+
+static int hci_blacklist_add(struct hci_dev *hdev, void __user *arg)
+{
+	bdaddr_t bdaddr;
+	struct bdaddr_list *entry;
+
+	if (copy_from_user(&bdaddr, arg, sizeof(bdaddr)))
+		return -EFAULT;
+
+	if (bacmp(&bdaddr, BDADDR_ANY) == 0)
+		return -EBADF;
+
+	if (hci_blacklist_lookup(hdev, &bdaddr))
+		return -EEXIST;
+
+	entry = kzalloc(sizeof(struct bdaddr_list), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	bacpy(&entry->bdaddr, &bdaddr);
+
+	list_add(&entry->list, &hdev->blacklist);
+
+	return 0;
+}
+
+int hci_blacklist_clear(struct hci_dev *hdev)
+{
+	struct list_head *p, *n;
+
+	list_for_each_safe(p, n, &hdev->blacklist) {
+		struct bdaddr_list *b;
+
+		b = list_entry(p, struct bdaddr_list, list);
+
+		list_del(p);
+		kfree(b);
+	}
+
+	return 0;
+}
+
+static int hci_blacklist_del(struct hci_dev *hdev, void __user *arg)
+{
+	bdaddr_t bdaddr;
+	struct bdaddr_list *entry;
+
+	if (copy_from_user(&bdaddr, arg, sizeof(bdaddr)))
+		return -EFAULT;
+
+	if (bacmp(&bdaddr, BDADDR_ANY) == 0)
+		return hci_blacklist_clear(hdev);
+
+	entry = hci_blacklist_lookup(hdev, &bdaddr);
+	if (!entry)
+		return -ENOENT;
+
+	list_del(&entry->list);
+	kfree(entry);
+
+	return 0;
+}
+
 /* Ioctls that require bound socket */
 static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg)
 {
@@ -194,6 +272,16 @@ static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsign
 	case HCIGETAUTHINFO:
 		return hci_get_auth_info(hdev, (void __user *) arg);
 
+	case HCIBLOCKADDR:
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+		return hci_blacklist_add(hdev, (void __user *) arg);
+
+	case HCIUNBLOCKADDR:
+		if (!capable(CAP_NET_ADMIN))
+			return -EACCES;
+		return hci_blacklist_del(hdev, (void __user *) arg);
+
 	default:
 		if (hdev->ioctl)
 			return hdev->ioctl(hdev, cmd, arg);
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 463ffa4fe04..5fce3d6d07b 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -37,9 +37,7 @@ static ssize_t show_link_type(struct device *dev, struct device_attribute *attr,
 static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct hci_conn *conn = dev_get_drvdata(dev);
-	bdaddr_t bdaddr;
-	baswap(&bdaddr, &conn->dst);
-	return sprintf(buf, "%s\n", batostr(&bdaddr));
+	return sprintf(buf, "%s\n", batostr(&conn->dst));
 }
 
 static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf)
@@ -196,8 +194,8 @@ static inline char *host_typetostr(int type)
 	switch (type) {
 	case HCI_BREDR:
 		return "BR/EDR";
-	case HCI_80211:
-		return "802.11";
+	case HCI_AMP:
+		return "AMP";
 	default:
 		return "UNKNOWN";
 	}
@@ -238,9 +236,7 @@ static ssize_t show_class(struct device *dev, struct device_attribute *attr, cha
 static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct hci_dev *hdev = dev_get_drvdata(dev);
-	bdaddr_t bdaddr;
-	baswap(&bdaddr, &hdev->bdaddr);
-	return sprintf(buf, "%s\n", batostr(&bdaddr));
+	return sprintf(buf, "%s\n", batostr(&hdev->bdaddr));
 }
 
 static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf)
@@ -408,10 +404,8 @@ static int inquiry_cache_show(struct seq_file *f, void *p)
 
 	for (e = cache->list; e; e = e->next) {
 		struct inquiry_data *data = &e->data;
-		bdaddr_t bdaddr;
-		baswap(&bdaddr, &data->bdaddr);
 		seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n",
-			   batostr(&bdaddr),
+			   batostr(&data->bdaddr),
 			   data->pscan_rep_mode, data->pscan_period_mode,
 			   data->pscan_mode, data->dev_class[2],
 			   data->dev_class[1], data->dev_class[0],
@@ -436,6 +430,37 @@ static const struct file_operations inquiry_cache_fops = {
 	.release	= single_release,
 };
 
+static int blacklist_show(struct seq_file *f, void *p)
+{
+	struct hci_dev *hdev = f->private;
+	struct list_head *l;
+
+	hci_dev_lock_bh(hdev);
+
+	list_for_each(l, &hdev->blacklist) {
+		struct bdaddr_list *b;
+
+		b = list_entry(l, struct bdaddr_list, list);
+
+		seq_printf(f, "%s\n", batostr(&b->bdaddr));
+	}
+
+	hci_dev_unlock_bh(hdev);
+
+	return 0;
+}
+
+static int blacklist_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, blacklist_show, inode->i_private);
+}
+
+static const struct file_operations blacklist_fops = {
+	.open		= blacklist_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 int hci_register_sysfs(struct hci_dev *hdev)
 {
 	struct device *dev = &hdev->dev;
@@ -465,6 +490,9 @@ int hci_register_sysfs(struct hci_dev *hdev)
 	debugfs_create_file("inquiry_cache", 0444, hdev->debugfs,
 						hdev, &inquiry_cache_fops);
 
+	debugfs_create_file("blacklist", 0444, hdev->debugfs,
+						hdev, &blacklist_fops);
+
 	return 0;
 }
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index bfe641b7dfa..c0ee8b3928e 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -758,7 +758,6 @@ static int hidp_setup_hid(struct hidp_session *session,
 				struct hidp_connadd_req *req)
 {
 	struct hid_device *hid;
-	bdaddr_t src, dst;
 	int err;
 
 	session->rd_data = kzalloc(req->rd_size, GFP_KERNEL);
@@ -781,9 +780,6 @@ static int hidp_setup_hid(struct hidp_session *session,
 
 	hid->driver_data = session;
 
-	baswap(&src, &bt_sk(session->ctrl_sock->sk)->src);
-	baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst);
-
 	hid->bus     = BUS_BLUETOOTH;
 	hid->vendor  = req->vendor;
 	hid->product = req->product;
@@ -791,8 +787,8 @@ static int hidp_setup_hid(struct hidp_session *session,
 	hid->country = req->country;
 
 	strncpy(hid->name, req->name, 128);
-	strncpy(hid->phys, batostr(&src), 64);
-	strncpy(hid->uniq, batostr(&dst), 64);
+	strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64);
+	strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64);
 
 	hid->dev.parent = hidp_get_device(session);
 	hid->ll_driver = &hidp_hid_driver;
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index cf3c4073a8a..daa7a988d9a 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -1,6 +1,8 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org>
+   Copyright (C) 2010 Google Inc.
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -53,15 +55,9 @@
 #include <net/bluetooth/hci_core.h>
 #include <net/bluetooth/l2cap.h>
 
-#define VERSION "2.14"
+#define VERSION "2.15"
 
-#ifdef CONFIG_BT_L2CAP_EXT_FEATURES
-static int enable_ertm = 1;
-#else
-static int enable_ertm = 0;
-#endif
-static int max_transmit = L2CAP_DEFAULT_MAX_TX;
-static int tx_window = L2CAP_DEFAULT_TX_WINDOW;
+static int disable_ertm = 0;
 
 static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN;
 static u8 l2cap_fixed_chan[8] = { 0x02, };
@@ -80,9 +76,12 @@ static void __l2cap_sock_close(struct sock *sk, int reason);
 static void l2cap_sock_close(struct sock *sk);
 static void l2cap_sock_kill(struct sock *sk);
 
+static int l2cap_build_conf_req(struct sock *sk, void *data);
 static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
 				u8 code, u8 ident, u16 dlen, void *data);
 
+static int l2cap_ertm_data_rcv(struct sock *sk, struct sk_buff *skb);
+
 /* ---- L2CAP timers ---- */
 static void l2cap_sock_timeout(unsigned long arg)
 {
@@ -278,6 +277,24 @@ static void l2cap_chan_del(struct sock *sk, int err)
 		parent->sk_data_ready(parent, 0);
 	} else
 		sk->sk_state_change(sk);
+
+	skb_queue_purge(TX_QUEUE(sk));
+
+	if (l2cap_pi(sk)->mode == L2CAP_MODE_ERTM) {
+		struct srej_list *l, *tmp;
+
+		del_timer(&l2cap_pi(sk)->retrans_timer);
+		del_timer(&l2cap_pi(sk)->monitor_timer);
+		del_timer(&l2cap_pi(sk)->ack_timer);
+
+		skb_queue_purge(SREJ_QUEUE(sk));
+		skb_queue_purge(BUSY_QUEUE(sk));
+
+		list_for_each_entry_safe(l, tmp, SREJ_LIST(sk), list) {
+			list_del(&l->list);
+			kfree(l);
+		}
+	}
 }
 
 /* Service level security */
@@ -351,8 +368,12 @@ static inline void l2cap_send_sframe(struct l2cap_pinfo *pi, u16 control)
 	struct sk_buff *skb;
 	struct l2cap_hdr *lh;
 	struct l2cap_conn *conn = pi->conn;
+	struct sock *sk = (struct sock *)pi;
 	int count, hlen = L2CAP_HDR_SIZE + 2;
 
+	if (sk->sk_state != BT_CONNECTED)
+		return;
+
 	if (pi->fcs == L2CAP_FCS_CRC16)
 		hlen += 2;
 
@@ -440,24 +461,57 @@ static void l2cap_do_start(struct sock *sk)
 	}
 }
 
-static void l2cap_send_disconn_req(struct l2cap_conn *conn, struct sock *sk)
+static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask)
+{
+	u32 local_feat_mask = l2cap_feat_mask;
+	if (!disable_ertm)
+		local_feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING;
+
+	switch (mode) {
+	case L2CAP_MODE_ERTM:
+		return L2CAP_FEAT_ERTM & feat_mask & local_feat_mask;
+	case L2CAP_MODE_STREAMING:
+		return L2CAP_FEAT_STREAMING & feat_mask & local_feat_mask;
+	default:
+		return 0x00;
+	}
+}
+
+static void l2cap_send_disconn_req(struct l2cap_conn *conn, struct sock *sk, int err)
 {
 	struct l2cap_disconn_req req;
 
+	if (!conn)
+		return;
+
+	skb_queue_purge(TX_QUEUE(sk));
+
+	if (l2cap_pi(sk)->mode == L2CAP_MODE_ERTM) {
+		del_timer(&l2cap_pi(sk)->retrans_timer);
+		del_timer(&l2cap_pi(sk)->monitor_timer);
+		del_timer(&l2cap_pi(sk)->ack_timer);
+	}
+
 	req.dcid = cpu_to_le16(l2cap_pi(sk)->dcid);
 	req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
 	l2cap_send_cmd(conn, l2cap_get_ident(conn),
 			L2CAP_DISCONN_REQ, sizeof(req), &req);
+
+	sk->sk_state = BT_DISCONN;
+	sk->sk_err = err;
 }
 
 /* ---- L2CAP connections ---- */
 static void l2cap_conn_start(struct l2cap_conn *conn)
 {
 	struct l2cap_chan_list *l = &conn->chan_list;
+	struct sock_del_list del, *tmp1, *tmp2;
 	struct sock *sk;
 
 	BT_DBG("conn %p", conn);
 
+	INIT_LIST_HEAD(&del.list);
+
 	read_lock(&l->lock);
 
 	for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
@@ -470,20 +524,38 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
 		}
 
 		if (sk->sk_state == BT_CONNECT) {
-			if (l2cap_check_security(sk) &&
-					__l2cap_no_conn_pending(sk)) {
-				struct l2cap_conn_req req;
-				req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
-				req.psm  = l2cap_pi(sk)->psm;
+			struct l2cap_conn_req req;
 
-				l2cap_pi(sk)->ident = l2cap_get_ident(conn);
-				l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND;
+			if (!l2cap_check_security(sk) ||
+					!__l2cap_no_conn_pending(sk)) {
+				bh_unlock_sock(sk);
+				continue;
+			}
 
-				l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
-					L2CAP_CONN_REQ, sizeof(req), &req);
+			if (!l2cap_mode_supported(l2cap_pi(sk)->mode,
+					conn->feat_mask)
+					&& l2cap_pi(sk)->conf_state &
+					L2CAP_CONF_STATE2_DEVICE) {
+				tmp1 = kzalloc(sizeof(struct sock_del_list),
+						GFP_ATOMIC);
+				tmp1->sk = sk;
+				list_add_tail(&tmp1->list, &del.list);
+				bh_unlock_sock(sk);
+				continue;
 			}
+
+			req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
+			req.psm  = l2cap_pi(sk)->psm;
+
+			l2cap_pi(sk)->ident = l2cap_get_ident(conn);
+			l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND;
+
+			l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
+				L2CAP_CONN_REQ, sizeof(req), &req);
+
 		} else if (sk->sk_state == BT_CONNECT2) {
 			struct l2cap_conn_rsp rsp;
+			char buf[128];
 			rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
 			rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
 
@@ -506,12 +578,31 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
 
 			l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
 					L2CAP_CONN_RSP, sizeof(rsp), &rsp);
+
+			if (l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT ||
+					rsp.result != L2CAP_CR_SUCCESS) {
+				bh_unlock_sock(sk);
+				continue;
+			}
+
+			l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
+			l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+						l2cap_build_conf_req(sk, buf), buf);
+			l2cap_pi(sk)->num_conf_req++;
 		}
 
 		bh_unlock_sock(sk);
 	}
 
 	read_unlock(&l->lock);
+
+	list_for_each_entry_safe(tmp1, tmp2, &del.list, list) {
+		bh_lock_sock(tmp1->sk);
+		__l2cap_sock_close(tmp1->sk, ECONNRESET);
+		bh_unlock_sock(tmp1->sk);
+		list_del(&tmp1->list);
+		kfree(tmp1);
+	}
 }
 
 static void l2cap_conn_ready(struct l2cap_conn *conn)
@@ -740,9 +831,8 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
 				sk->sk_type == SOCK_STREAM) {
 			struct l2cap_conn *conn = l2cap_pi(sk)->conn;
 
-			sk->sk_state = BT_DISCONN;
 			l2cap_sock_set_timer(sk, sk->sk_sndtimeo);
-			l2cap_send_disconn_req(conn, sk);
+			l2cap_send_disconn_req(conn, sk, reason);
 		} else
 			l2cap_chan_del(sk, reason);
 		break;
@@ -802,6 +892,7 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
 
 		pi->imtu = l2cap_pi(parent)->imtu;
 		pi->omtu = l2cap_pi(parent)->omtu;
+		pi->conf_state = l2cap_pi(parent)->conf_state;
 		pi->mode = l2cap_pi(parent)->mode;
 		pi->fcs  = l2cap_pi(parent)->fcs;
 		pi->max_tx = l2cap_pi(parent)->max_tx;
@@ -812,13 +903,15 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
 	} else {
 		pi->imtu = L2CAP_DEFAULT_MTU;
 		pi->omtu = 0;
-		if (enable_ertm && sk->sk_type == SOCK_STREAM)
+		if (!disable_ertm && sk->sk_type == SOCK_STREAM) {
 			pi->mode = L2CAP_MODE_ERTM;
-		else
+			pi->conf_state |= L2CAP_CONF_STATE2_DEVICE;
+		} else {
 			pi->mode = L2CAP_MODE_BASIC;
-		pi->max_tx = max_transmit;
+		}
+		pi->max_tx = L2CAP_DEFAULT_MAX_TX;
 		pi->fcs  = L2CAP_FCS_CRC16;
-		pi->tx_win = tx_window;
+		pi->tx_win = L2CAP_DEFAULT_TX_WINDOW;
 		pi->sec_level = BT_SECURITY_LOW;
 		pi->role_switch = 0;
 		pi->force_reliable = 0;
@@ -915,10 +1008,20 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
 		goto done;
 	}
 
-	if (la.l2_psm && __le16_to_cpu(la.l2_psm) < 0x1001 &&
-				!capable(CAP_NET_BIND_SERVICE)) {
-		err = -EACCES;
-		goto done;
+	if (la.l2_psm) {
+		__u16 psm = __le16_to_cpu(la.l2_psm);
+
+		/* PSM must be odd and lsb of upper byte must be 0 */
+		if ((psm & 0x0101) != 0x0001) {
+			err = -EINVAL;
+			goto done;
+		}
+
+		/* Restrict usage of well-known PSMs */
+		if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) {
+			err = -EACCES;
+			goto done;
+		}
 	}
 
 	write_lock_bh(&l2cap_sk_list.lock);
@@ -1067,7 +1170,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
 		break;
 	case L2CAP_MODE_ERTM:
 	case L2CAP_MODE_STREAMING:
-		if (enable_ertm)
+		if (!disable_ertm)
 			break;
 		/* fall through */
 	default:
@@ -1084,6 +1187,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
 
 	case BT_CONNECTED:
 		/* Already connected */
+		err = -EISCONN;
 		goto done;
 
 	case BT_OPEN:
@@ -1096,6 +1200,13 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
 		goto done;
 	}
 
+	/* PSM must be odd and lsb of upper byte must be 0 */
+	if ((__le16_to_cpu(la.l2_psm) & 0x0101) != 0x0001 &&
+		sk->sk_type != SOCK_RAW) {
+		err = -EINVAL;
+		goto done;
+	}
+
 	/* Set destination address and psm */
 	bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
 	l2cap_pi(sk)->psm = la.l2_psm;
@@ -1132,7 +1243,7 @@ static int l2cap_sock_listen(struct socket *sock, int backlog)
 		break;
 	case L2CAP_MODE_ERTM:
 	case L2CAP_MODE_STREAMING:
-		if (enable_ertm)
+		if (!disable_ertm)
 			break;
 		/* fall through */
 	default:
@@ -1285,9 +1396,11 @@ static void l2cap_monitor_timeout(unsigned long arg)
 {
 	struct sock *sk = (void *) arg;
 
+	BT_DBG("sk %p", sk);
+
 	bh_lock_sock(sk);
 	if (l2cap_pi(sk)->retry_count >= l2cap_pi(sk)->remote_max_tx) {
-		l2cap_send_disconn_req(l2cap_pi(sk)->conn, sk);
+		l2cap_send_disconn_req(l2cap_pi(sk)->conn, sk, ECONNABORTED);
 		bh_unlock_sock(sk);
 		return;
 	}
@@ -1303,6 +1416,8 @@ static void l2cap_retrans_timeout(unsigned long arg)
 {
 	struct sock *sk = (void *) arg;
 
+	BT_DBG("sk %p", sk);
+
 	bh_lock_sock(sk);
 	l2cap_pi(sk)->retry_count = 1;
 	__mod_monitor_timer();
@@ -1341,37 +1456,26 @@ static inline void l2cap_do_send(struct sock *sk, struct sk_buff *skb)
 	hci_send_acl(pi->conn->hcon, skb, 0);
 }
 
-static int l2cap_streaming_send(struct sock *sk)
+static void l2cap_streaming_send(struct sock *sk)
 {
-	struct sk_buff *skb, *tx_skb;
+	struct sk_buff *skb;
 	struct l2cap_pinfo *pi = l2cap_pi(sk);
 	u16 control, fcs;
 
-	while ((skb = sk->sk_send_head)) {
-		tx_skb = skb_clone(skb, GFP_ATOMIC);
-
-		control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
+	while ((skb = skb_dequeue(TX_QUEUE(sk)))) {
+		control = get_unaligned_le16(skb->data + L2CAP_HDR_SIZE);
 		control |= pi->next_tx_seq << L2CAP_CTRL_TXSEQ_SHIFT;
-		put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE);
+		put_unaligned_le16(control, skb->data + L2CAP_HDR_SIZE);
 
 		if (pi->fcs == L2CAP_FCS_CRC16) {
-			fcs = crc16(0, (u8 *)tx_skb->data, tx_skb->len - 2);
-			put_unaligned_le16(fcs, tx_skb->data + tx_skb->len - 2);
+			fcs = crc16(0, (u8 *)skb->data, skb->len - 2);
+			put_unaligned_le16(fcs, skb->data + skb->len - 2);
 		}
 
-		l2cap_do_send(sk, tx_skb);
+		l2cap_do_send(sk, skb);
 
 		pi->next_tx_seq = (pi->next_tx_seq + 1) % 64;
-
-		if (skb_queue_is_last(TX_QUEUE(sk), skb))
-			sk->sk_send_head = NULL;
-		else
-			sk->sk_send_head = skb_queue_next(TX_QUEUE(sk), skb);
-
-		skb = skb_dequeue(TX_QUEUE(sk));
-		kfree_skb(skb);
 	}
-	return 0;
 }
 
 static void l2cap_retransmit_one_frame(struct sock *sk, u8 tx_seq)
@@ -1395,15 +1499,22 @@ static void l2cap_retransmit_one_frame(struct sock *sk, u8 tx_seq)
 
 	if (pi->remote_max_tx &&
 			bt_cb(skb)->retries == pi->remote_max_tx) {
-		l2cap_send_disconn_req(pi->conn, sk);
+		l2cap_send_disconn_req(pi->conn, sk, ECONNABORTED);
 		return;
 	}
 
 	tx_skb = skb_clone(skb, GFP_ATOMIC);
 	bt_cb(skb)->retries++;
 	control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
+
+	if (pi->conn_state & L2CAP_CONN_SEND_FBIT) {
+		control |= L2CAP_CTRL_FINAL;
+		pi->conn_state &= ~L2CAP_CONN_SEND_FBIT;
+	}
+
 	control |= (pi->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT)
 			| (tx_seq << L2CAP_CTRL_TXSEQ_SHIFT);
+
 	put_unaligned_le16(control, tx_skb->data + L2CAP_HDR_SIZE);
 
 	if (pi->fcs == L2CAP_FCS_CRC16) {
@@ -1421,15 +1532,14 @@ static int l2cap_ertm_send(struct sock *sk)
 	u16 control, fcs;
 	int nsent = 0;
 
-	if (pi->conn_state & L2CAP_CONN_WAIT_F)
-		return 0;
+	if (sk->sk_state != BT_CONNECTED)
+		return -ENOTCONN;
 
-	while ((skb = sk->sk_send_head) && (!l2cap_tx_window_full(sk)) &&
-			!(pi->conn_state & L2CAP_CONN_REMOTE_BUSY)) {
+	while ((skb = sk->sk_send_head) && (!l2cap_tx_window_full(sk))) {
 
 		if (pi->remote_max_tx &&
 				bt_cb(skb)->retries == pi->remote_max_tx) {
-			l2cap_send_disconn_req(pi->conn, sk);
+			l2cap_send_disconn_req(pi->conn, sk, ECONNABORTED);
 			break;
 		}
 
@@ -1438,6 +1548,8 @@ static int l2cap_ertm_send(struct sock *sk)
 		bt_cb(skb)->retries++;
 
 		control = get_unaligned_le16(tx_skb->data + L2CAP_HDR_SIZE);
+		control &= L2CAP_CTRL_SAR;
+
 		if (pi->conn_state & L2CAP_CONN_SEND_FBIT) {
 			control |= L2CAP_CTRL_FINAL;
 			pi->conn_state &= ~L2CAP_CONN_SEND_FBIT;
@@ -1478,16 +1590,11 @@ static int l2cap_retransmit_frames(struct sock *sk)
 	struct l2cap_pinfo *pi = l2cap_pi(sk);
 	int ret;
 
-	spin_lock_bh(&pi->send_lock);
-
 	if (!skb_queue_empty(TX_QUEUE(sk)))
 		sk->sk_send_head = TX_QUEUE(sk)->next;
 
 	pi->next_tx_seq = pi->expected_ack_seq;
 	ret = l2cap_ertm_send(sk);
-
-	spin_unlock_bh(&pi->send_lock);
-
 	return ret;
 }
 
@@ -1495,7 +1602,6 @@ static void l2cap_send_ack(struct l2cap_pinfo *pi)
 {
 	struct sock *sk = (struct sock *)pi;
 	u16 control = 0;
-	int nframes;
 
 	control |= pi->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
 
@@ -1506,11 +1612,7 @@ static void l2cap_send_ack(struct l2cap_pinfo *pi)
 		return;
 	}
 
-	spin_lock_bh(&pi->send_lock);
-	nframes = l2cap_ertm_send(sk);
-	spin_unlock_bh(&pi->send_lock);
-
-	if (nframes > 0)
+	if (l2cap_ertm_send(sk) > 0)
 		return;
 
 	control |= L2CAP_SUPER_RCV_READY;
@@ -1550,7 +1652,7 @@ static inline int l2cap_skbuff_fromiovec(struct sock *sk, struct msghdr *msg, in
 
 		*frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err);
 		if (!*frag)
-			return -EFAULT;
+			return err;
 		if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count))
 			return -EFAULT;
 
@@ -1576,7 +1678,7 @@ static struct sk_buff *l2cap_create_connless_pdu(struct sock *sk, struct msghdr
 	skb = bt_skb_send_alloc(sk, count + hlen,
 			msg->msg_flags & MSG_DONTWAIT, &err);
 	if (!skb)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(err);
 
 	/* Create L2CAP header */
 	lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1605,7 +1707,7 @@ static struct sk_buff *l2cap_create_basic_pdu(struct sock *sk, struct msghdr *ms
 	skb = bt_skb_send_alloc(sk, count + hlen,
 			msg->msg_flags & MSG_DONTWAIT, &err);
 	if (!skb)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(err);
 
 	/* Create L2CAP header */
 	lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1642,7 +1744,7 @@ static struct sk_buff *l2cap_create_iframe_pdu(struct sock *sk, struct msghdr *m
 	skb = bt_skb_send_alloc(sk, count + hlen,
 			msg->msg_flags & MSG_DONTWAIT, &err);
 	if (!skb)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(err);
 
 	/* Create L2CAP header */
 	lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE);
@@ -1705,10 +1807,8 @@ static inline int l2cap_sar_segment_sdu(struct sock *sk, struct msghdr *msg, siz
 		size += buflen;
 	}
 	skb_queue_splice_tail(&sar_queue, TX_QUEUE(sk));
-	spin_lock_bh(&pi->send_lock);
 	if (sk->sk_send_head == NULL)
 		sk->sk_send_head = sar_queue.next;
-	spin_unlock_bh(&pi->send_lock);
 
 	return size;
 }
@@ -1753,7 +1853,7 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 	case L2CAP_MODE_BASIC:
 		/* Check outgoing MTU */
 		if (len > pi->omtu) {
-			err = -EINVAL;
+			err = -EMSGSIZE;
 			goto done;
 		}
 
@@ -1780,14 +1880,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 			}
 			__skb_queue_tail(TX_QUEUE(sk), skb);
 
-			if (pi->mode == L2CAP_MODE_ERTM)
-				spin_lock_bh(&pi->send_lock);
-
 			if (sk->sk_send_head == NULL)
 				sk->sk_send_head = skb;
 
-			if (pi->mode == L2CAP_MODE_ERTM)
-				spin_unlock_bh(&pi->send_lock);
 		} else {
 		/* Segment SDU into multiples PDUs */
 			err = l2cap_sar_segment_sdu(sk, msg, len);
@@ -1796,11 +1891,14 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 		}
 
 		if (pi->mode == L2CAP_MODE_STREAMING) {
-			err = l2cap_streaming_send(sk);
+			l2cap_streaming_send(sk);
 		} else {
-			spin_lock_bh(&pi->send_lock);
+			if (pi->conn_state & L2CAP_CONN_REMOTE_BUSY &&
+					pi->conn_state && L2CAP_CONN_WAIT_F) {
+				err = len;
+				break;
+			}
 			err = l2cap_ertm_send(sk);
-			spin_unlock_bh(&pi->send_lock);
 		}
 
 		if (err >= 0)
@@ -1809,7 +1907,7 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
 
 	default:
 		BT_DBG("bad state %1.1x", pi->mode);
-		err = -EINVAL;
+		err = -EBADFD;
 	}
 
 done:
@@ -1825,6 +1923,8 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct ms
 
 	if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) {
 		struct l2cap_conn_rsp rsp;
+		struct l2cap_conn *conn = l2cap_pi(sk)->conn;
+		u8 buf[128];
 
 		sk->sk_state = BT_CONFIG;
 
@@ -1835,12 +1935,25 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct ms
 		l2cap_send_cmd(l2cap_pi(sk)->conn, l2cap_pi(sk)->ident,
 					L2CAP_CONN_RSP, sizeof(rsp), &rsp);
 
+		if (l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) {
+			release_sock(sk);
+			return 0;
+		}
+
+		l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
+		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+				l2cap_build_conf_req(sk, buf), buf);
+		l2cap_pi(sk)->num_conf_req++;
+
 		release_sock(sk);
 		return 0;
 	}
 
 	release_sock(sk);
 
+	if (sock->type == SOCK_STREAM)
+		return bt_sock_stream_recvmsg(iocb, sock, msg, len, flags);
+
 	return bt_sock_recvmsg(iocb, sock, msg, len, flags);
 }
 
@@ -1857,6 +1970,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __us
 
 	switch (optname) {
 	case L2CAP_OPTIONS:
+		if (sk->sk_state == BT_CONNECTED) {
+			err = -EINVAL;
+			break;
+		}
+
 		opts.imtu     = l2cap_pi(sk)->imtu;
 		opts.omtu     = l2cap_pi(sk)->omtu;
 		opts.flush_to = l2cap_pi(sk)->flush_to;
@@ -1871,13 +1989,19 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __us
 			break;
 		}
 
+		if (opts.txwin_size > L2CAP_DEFAULT_TX_WINDOW) {
+			err = -EINVAL;
+			break;
+		}
+
 		l2cap_pi(sk)->mode = opts.mode;
 		switch (l2cap_pi(sk)->mode) {
 		case L2CAP_MODE_BASIC:
+			l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_STATE2_DEVICE;
 			break;
 		case L2CAP_MODE_ERTM:
 		case L2CAP_MODE_STREAMING:
-			if (enable_ertm)
+			if (!disable_ertm)
 				break;
 			/* fall through */
 		default:
@@ -2145,6 +2269,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
 			err = bt_sock_wait_state(sk, BT_CLOSED,
 							sk->sk_lingertime);
 	}
+
+	if (!err && sk->sk_err)
+		err = -sk->sk_err;
+
 	release_sock(sk);
 	return err;
 }
@@ -2365,25 +2493,10 @@ static inline void l2cap_ertm_init(struct sock *sk)
 
 	__skb_queue_head_init(SREJ_QUEUE(sk));
 	__skb_queue_head_init(BUSY_QUEUE(sk));
-	spin_lock_init(&l2cap_pi(sk)->send_lock);
 
 	INIT_WORK(&l2cap_pi(sk)->busy_work, l2cap_busy_work);
-}
 
-static int l2cap_mode_supported(__u8 mode, __u32 feat_mask)
-{
-	u32 local_feat_mask = l2cap_feat_mask;
-	if (enable_ertm)
-		local_feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING;
-
-	switch (mode) {
-	case L2CAP_MODE_ERTM:
-		return L2CAP_FEAT_ERTM & feat_mask & local_feat_mask;
-	case L2CAP_MODE_STREAMING:
-		return L2CAP_FEAT_STREAMING & feat_mask & local_feat_mask;
-	default:
-		return 0x00;
-	}
+	sk->sk_backlog_rcv = l2cap_ertm_data_rcv;
 }
 
 static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
@@ -2414,10 +2527,10 @@ static int l2cap_build_conf_req(struct sock *sk, void *data)
 	switch (pi->mode) {
 	case L2CAP_MODE_STREAMING:
 	case L2CAP_MODE_ERTM:
-		pi->conf_state |= L2CAP_CONF_STATE2_DEVICE;
-		if (!l2cap_mode_supported(pi->mode, pi->conn->feat_mask))
-			l2cap_send_disconn_req(pi->conn, sk);
-		break;
+		if (pi->conf_state & L2CAP_CONF_STATE2_DEVICE)
+			break;
+
+		/* fall through */
 	default:
 		pi->mode = l2cap_select_mode(rfc.mode, pi->conn->feat_mask);
 		break;
@@ -2428,6 +2541,20 @@ done:
 	case L2CAP_MODE_BASIC:
 		if (pi->imtu != L2CAP_DEFAULT_MTU)
 			l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu);
+
+		if (!(pi->conn->feat_mask & L2CAP_FEAT_ERTM) &&
+				!(pi->conn->feat_mask & L2CAP_FEAT_STREAMING))
+			break;
+
+		rfc.mode            = L2CAP_MODE_BASIC;
+		rfc.txwin_size      = 0;
+		rfc.max_transmit    = 0;
+		rfc.retrans_timeout = 0;
+		rfc.monitor_timeout = 0;
+		rfc.max_pdu_size    = 0;
+
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+							(unsigned long) &rfc);
 		break;
 
 	case L2CAP_MODE_ERTM:
@@ -2440,8 +2567,8 @@ done:
 		if (L2CAP_DEFAULT_MAX_PDU_SIZE > pi->conn->mtu - 10)
 			rfc.max_pdu_size = cpu_to_le16(pi->conn->mtu - 10);
 
-		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
-					sizeof(rfc), (unsigned long) &rfc);
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+							(unsigned long) &rfc);
 
 		if (!(pi->conn->feat_mask & L2CAP_FEAT_FCS))
 			break;
@@ -2463,8 +2590,8 @@ done:
 		if (L2CAP_DEFAULT_MAX_PDU_SIZE > pi->conn->mtu - 10)
 			rfc.max_pdu_size = cpu_to_le16(pi->conn->mtu - 10);
 
-		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
-					sizeof(rfc), (unsigned long) &rfc);
+		l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+							(unsigned long) &rfc);
 
 		if (!(pi->conn->feat_mask & L2CAP_FEAT_FCS))
 			break;
@@ -2541,18 +2668,21 @@ static int l2cap_parse_conf_req(struct sock *sk, void *data)
 		}
 	}
 
-	if (pi->num_conf_rsp || pi->num_conf_req)
+	if (pi->num_conf_rsp || pi->num_conf_req > 1)
 		goto done;
 
 	switch (pi->mode) {
 	case L2CAP_MODE_STREAMING:
 	case L2CAP_MODE_ERTM:
-		pi->conf_state |= L2CAP_CONF_STATE2_DEVICE;
-		if (!l2cap_mode_supported(pi->mode, pi->conn->feat_mask))
+		if (!(pi->conf_state & L2CAP_CONF_STATE2_DEVICE)) {
+			pi->mode = l2cap_select_mode(rfc.mode,
+					pi->conn->feat_mask);
+			break;
+		}
+
+		if (pi->mode != rfc.mode)
 			return -ECONNREFUSED;
-		break;
-	default:
-		pi->mode = l2cap_select_mode(rfc.mode, pi->conn->feat_mask);
+
 		break;
 	}
 
@@ -2590,8 +2720,9 @@ done:
 		case L2CAP_MODE_ERTM:
 			pi->remote_tx_win = rfc.txwin_size;
 			pi->remote_max_tx = rfc.max_transmit;
-			if (rfc.max_pdu_size > pi->conn->mtu - 10)
-				rfc.max_pdu_size = le16_to_cpu(pi->conn->mtu - 10);
+
+			if (le16_to_cpu(rfc.max_pdu_size) > pi->conn->mtu - 10)
+				rfc.max_pdu_size = cpu_to_le16(pi->conn->mtu - 10);
 
 			pi->remote_mps = le16_to_cpu(rfc.max_pdu_size);
 
@@ -2608,8 +2739,8 @@ done:
 			break;
 
 		case L2CAP_MODE_STREAMING:
-			if (rfc.max_pdu_size > pi->conn->mtu - 10)
-				rfc.max_pdu_size = le16_to_cpu(pi->conn->mtu - 10);
+			if (le16_to_cpu(rfc.max_pdu_size) > pi->conn->mtu - 10)
+				rfc.max_pdu_size = cpu_to_le16(pi->conn->mtu - 10);
 
 			pi->remote_mps = le16_to_cpu(rfc.max_pdu_size);
 
@@ -2655,10 +2786,10 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
 		case L2CAP_CONF_MTU:
 			if (val < L2CAP_DEFAULT_MIN_MTU) {
 				*result = L2CAP_CONF_UNACCEPT;
-				pi->omtu = L2CAP_DEFAULT_MIN_MTU;
+				pi->imtu = L2CAP_DEFAULT_MIN_MTU;
 			} else
-				pi->omtu = val;
-			l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->omtu);
+				pi->imtu = val;
+			l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu);
 			break;
 
 		case L2CAP_CONF_FLUSH_TO:
@@ -2675,7 +2806,6 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
 							rfc.mode != pi->mode)
 				return -ECONNREFUSED;
 
-			pi->mode = rfc.mode;
 			pi->fcs = 0;
 
 			l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
@@ -2684,10 +2814,14 @@ static int l2cap_parse_conf_rsp(struct sock *sk, void *rsp, int len, void *data,
 		}
 	}
 
+	if (pi->mode == L2CAP_MODE_BASIC && pi->mode != rfc.mode)
+		return -ECONNREFUSED;
+
+	pi->mode = rfc.mode;
+
 	if (*result == L2CAP_CONF_SUCCESS) {
 		switch (rfc.mode) {
 		case L2CAP_MODE_ERTM:
-			pi->remote_tx_win   = rfc.txwin_size;
 			pi->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
 			pi->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
 			pi->mps    = le16_to_cpu(rfc.max_pdu_size);
@@ -2743,7 +2877,6 @@ static void l2cap_conf_rfc_get(struct sock *sk, void *rsp, int len)
 done:
 	switch (rfc.mode) {
 	case L2CAP_MODE_ERTM:
-		pi->remote_tx_win   = rfc.txwin_size;
 		pi->retrans_timeout = le16_to_cpu(rfc.retrans_timeout);
 		pi->monitor_timeout = le16_to_cpu(rfc.monitor_timeout);
 		pi->mps    = le16_to_cpu(rfc.max_pdu_size);
@@ -2778,7 +2911,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
 	struct l2cap_chan_list *list = &conn->chan_list;
 	struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
 	struct l2cap_conn_rsp rsp;
-	struct sock *sk, *parent;
+	struct sock *parent, *sk = NULL;
 	int result, status = L2CAP_CS_NO_INFO;
 
 	u16 dcid = 0, scid = __le16_to_cpu(req->scid);
@@ -2887,6 +3020,15 @@ sendresp:
 					L2CAP_INFO_REQ, sizeof(info), &info);
 	}
 
+	if (sk && !(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) &&
+				result == L2CAP_CR_SUCCESS) {
+		u8 buf[128];
+		l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
+		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
+					l2cap_build_conf_req(sk, buf), buf);
+		l2cap_pi(sk)->num_conf_req++;
+	}
+
 	return 0;
 }
 
@@ -2907,11 +3049,11 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
 	if (scid) {
 		sk = l2cap_get_chan_by_scid(&conn->chan_list, scid);
 		if (!sk)
-			return 0;
+			return -EFAULT;
 	} else {
 		sk = l2cap_get_chan_by_ident(&conn->chan_list, cmd->ident);
 		if (!sk)
-			return 0;
+			return -EFAULT;
 	}
 
 	switch (result) {
@@ -2919,9 +3061,13 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
 		sk->sk_state = BT_CONFIG;
 		l2cap_pi(sk)->ident = 0;
 		l2cap_pi(sk)->dcid = dcid;
-		l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
 		l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND;
 
+		if (l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)
+			break;
+
+		l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
+
 		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
 					l2cap_build_conf_req(sk, req), req);
 		l2cap_pi(sk)->num_conf_req++;
@@ -2940,6 +3086,17 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
 	return 0;
 }
 
+static inline void set_default_fcs(struct l2cap_pinfo *pi)
+{
+	/* FCS is enabled only in ERTM or streaming mode, if one or both
+	 * sides request it.
+	 */
+	if (pi->mode != L2CAP_MODE_ERTM && pi->mode != L2CAP_MODE_STREAMING)
+		pi->fcs = L2CAP_FCS_NONE;
+	else if (!(pi->conf_state & L2CAP_CONF_NO_FCS_RECV))
+		pi->fcs = L2CAP_FCS_CRC16;
+}
+
 static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
 {
 	struct l2cap_conf_req *req = (struct l2cap_conf_req *) data;
@@ -2984,7 +3141,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 	/* Complete config. */
 	len = l2cap_parse_conf_req(sk, rsp);
 	if (len < 0) {
-		l2cap_send_disconn_req(conn, sk);
+		l2cap_send_disconn_req(conn, sk, ECONNRESET);
 		goto unlock;
 	}
 
@@ -2998,9 +3155,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 		goto unlock;
 
 	if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) {
-		if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) ||
-		    l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
-			l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
+		set_default_fcs(l2cap_pi(sk));
 
 		sk->sk_state = BT_CONNECTED;
 
@@ -3016,6 +3171,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 
 	if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) {
 		u8 buf[64];
+		l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
 		l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
 					l2cap_build_conf_req(sk, buf), buf);
 		l2cap_pi(sk)->num_conf_req++;
@@ -3054,7 +3210,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 			char req[64];
 
 			if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) {
-				l2cap_send_disconn_req(conn, sk);
+				l2cap_send_disconn_req(conn, sk, ECONNRESET);
 				goto done;
 			}
 
@@ -3063,7 +3219,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 			len = l2cap_parse_conf_rsp(sk, rsp->data,
 							len, req, &result);
 			if (len < 0) {
-				l2cap_send_disconn_req(conn, sk);
+				l2cap_send_disconn_req(conn, sk, ECONNRESET);
 				goto done;
 			}
 
@@ -3076,10 +3232,9 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 		}
 
 	default:
-		sk->sk_state = BT_DISCONN;
 		sk->sk_err = ECONNRESET;
 		l2cap_sock_set_timer(sk, HZ * 5);
-		l2cap_send_disconn_req(conn, sk);
+		l2cap_send_disconn_req(conn, sk, ECONNRESET);
 		goto done;
 	}
 
@@ -3089,9 +3244,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
 	l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE;
 
 	if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) {
-		if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_NO_FCS_RECV) ||
-		    l2cap_pi(sk)->fcs != L2CAP_FCS_NONE)
-			l2cap_pi(sk)->fcs = L2CAP_FCS_CRC16;
+		set_default_fcs(l2cap_pi(sk));
 
 		sk->sk_state = BT_CONNECTED;
 		l2cap_pi(sk)->next_tx_seq = 0;
@@ -3130,16 +3283,6 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, struct l2cap_cmd
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
-	skb_queue_purge(TX_QUEUE(sk));
-
-	if (l2cap_pi(sk)->mode == L2CAP_MODE_ERTM) {
-		skb_queue_purge(SREJ_QUEUE(sk));
-		skb_queue_purge(BUSY_QUEUE(sk));
-		del_timer(&l2cap_pi(sk)->retrans_timer);
-		del_timer(&l2cap_pi(sk)->monitor_timer);
-		del_timer(&l2cap_pi(sk)->ack_timer);
-	}
-
 	l2cap_chan_del(sk, ECONNRESET);
 	bh_unlock_sock(sk);
 
@@ -3162,16 +3305,6 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, struct l2cap_cmd
 	if (!sk)
 		return 0;
 
-	skb_queue_purge(TX_QUEUE(sk));
-
-	if (l2cap_pi(sk)->mode == L2CAP_MODE_ERTM) {
-		skb_queue_purge(SREJ_QUEUE(sk));
-		skb_queue_purge(BUSY_QUEUE(sk));
-		del_timer(&l2cap_pi(sk)->retrans_timer);
-		del_timer(&l2cap_pi(sk)->monitor_timer);
-		del_timer(&l2cap_pi(sk)->ack_timer);
-	}
-
 	l2cap_chan_del(sk, 0);
 	bh_unlock_sock(sk);
 
@@ -3194,7 +3327,7 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm
 		struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
 		rsp->type   = cpu_to_le16(L2CAP_IT_FEAT_MASK);
 		rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
-		if (enable_ertm)
+		if (!disable_ertm)
 			feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING
 							 | L2CAP_FEAT_FCS;
 		put_unaligned_le32(feat_mask, rsp->data);
@@ -3231,6 +3364,15 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm
 
 	del_timer(&conn->info_timer);
 
+	if (result != L2CAP_IR_SUCCESS) {
+		conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
+		conn->info_ident = 0;
+
+		l2cap_conn_start(conn);
+
+		return 0;
+	}
+
 	if (type == L2CAP_IT_FEAT_MASK) {
 		conn->feat_mask = get_unaligned_le32(rsp->data);
 
@@ -3359,7 +3501,7 @@ static int l2cap_check_fcs(struct l2cap_pinfo *pi,  struct sk_buff *skb)
 		our_fcs = crc16(0, skb->data - hdr_size, skb->len + hdr_size);
 
 		if (our_fcs != rcv_fcs)
-			return -EINVAL;
+			return -EBADMSG;
 	}
 	return 0;
 }
@@ -3370,25 +3512,19 @@ static inline void l2cap_send_i_or_rr_or_rnr(struct sock *sk)
 	u16 control = 0;
 
 	pi->frames_sent = 0;
-	pi->conn_state |= L2CAP_CONN_SEND_FBIT;
 
 	control |= pi->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
 
 	if (pi->conn_state & L2CAP_CONN_LOCAL_BUSY) {
-		control |= L2CAP_SUPER_RCV_NOT_READY | L2CAP_CTRL_FINAL;
+		control |= L2CAP_SUPER_RCV_NOT_READY;
 		l2cap_send_sframe(pi, control);
 		pi->conn_state |= L2CAP_CONN_RNR_SENT;
-		pi->conn_state &= ~L2CAP_CONN_SEND_FBIT;
 	}
 
-	if (pi->conn_state & L2CAP_CONN_REMOTE_BUSY && pi->unacked_frames > 0)
-		__mod_retrans_timer();
-
-	pi->conn_state &= ~L2CAP_CONN_REMOTE_BUSY;
+	if (pi->conn_state & L2CAP_CONN_REMOTE_BUSY)
+		l2cap_retransmit_frames(sk);
 
-	spin_lock_bh(&pi->send_lock);
 	l2cap_ertm_send(sk);
-	spin_unlock_bh(&pi->send_lock);
 
 	if (!(pi->conn_state & L2CAP_CONN_LOCAL_BUSY) &&
 			pi->frames_sent == 0) {
@@ -3400,6 +3536,8 @@ static inline void l2cap_send_i_or_rr_or_rnr(struct sock *sk)
 static int l2cap_add_to_srej_queue(struct sock *sk, struct sk_buff *skb, u8 tx_seq, u8 sar)
 {
 	struct sk_buff *next_skb;
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
+	int tx_seq_offset, next_tx_seq_offset;
 
 	bt_cb(skb)->tx_seq = tx_seq;
 	bt_cb(skb)->sar = sar;
@@ -3410,11 +3548,20 @@ static int l2cap_add_to_srej_queue(struct sock *sk, struct sk_buff *skb, u8 tx_s
 		return 0;
 	}
 
+	tx_seq_offset = (tx_seq - pi->buffer_seq) % 64;
+	if (tx_seq_offset < 0)
+		tx_seq_offset += 64;
+
 	do {
 		if (bt_cb(next_skb)->tx_seq == tx_seq)
 			return -EINVAL;
 
-		if (bt_cb(next_skb)->tx_seq > tx_seq) {
+		next_tx_seq_offset = (bt_cb(next_skb)->tx_seq -
+						pi->buffer_seq) % 64;
+		if (next_tx_seq_offset < 0)
+			next_tx_seq_offset += 64;
+
+		if (next_tx_seq_offset > tx_seq_offset) {
 			__skb_queue_before(SREJ_QUEUE(sk), next_skb, skb);
 			return 0;
 		}
@@ -3532,11 +3679,51 @@ drop:
 	pi->sdu = NULL;
 
 disconnect:
-	l2cap_send_disconn_req(pi->conn, sk);
+	l2cap_send_disconn_req(pi->conn, sk, ECONNRESET);
 	kfree_skb(skb);
 	return 0;
 }
 
+static int l2cap_try_push_rx_skb(struct sock *sk)
+{
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
+	struct sk_buff *skb;
+	u16 control;
+	int err;
+
+	while ((skb = skb_dequeue(BUSY_QUEUE(sk)))) {
+		control = bt_cb(skb)->sar << L2CAP_CTRL_SAR_SHIFT;
+		err = l2cap_ertm_reassembly_sdu(sk, skb, control);
+		if (err < 0) {
+			skb_queue_head(BUSY_QUEUE(sk), skb);
+			return -EBUSY;
+		}
+
+		pi->buffer_seq = (pi->buffer_seq + 1) % 64;
+	}
+
+	if (!(pi->conn_state & L2CAP_CONN_RNR_SENT))
+		goto done;
+
+	control = pi->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
+	control |= L2CAP_SUPER_RCV_READY | L2CAP_CTRL_POLL;
+	l2cap_send_sframe(pi, control);
+	l2cap_pi(sk)->retry_count = 1;
+
+	del_timer(&pi->retrans_timer);
+	__mod_monitor_timer();
+
+	l2cap_pi(sk)->conn_state |= L2CAP_CONN_WAIT_F;
+
+done:
+	pi->conn_state &= ~L2CAP_CONN_LOCAL_BUSY;
+	pi->conn_state &= ~L2CAP_CONN_RNR_SENT;
+
+	BT_DBG("sk %p, Exit local busy", sk);
+
+	return 0;
+}
+
 static void l2cap_busy_work(struct work_struct *work)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -3545,7 +3732,6 @@ static void l2cap_busy_work(struct work_struct *work)
 	struct sock *sk = (struct sock *)pi;
 	int n_tries = 0, timeo = HZ/5, err;
 	struct sk_buff *skb;
-	u16 control;
 
 	lock_sock(sk);
 
@@ -3555,8 +3741,8 @@ static void l2cap_busy_work(struct work_struct *work)
 
 		if (n_tries++ > L2CAP_LOCAL_BUSY_TRIES) {
 			err = -EBUSY;
-			l2cap_send_disconn_req(pi->conn, sk);
-			goto done;
+			l2cap_send_disconn_req(pi->conn, sk, EBUSY);
+			break;
 		}
 
 		if (!timeo)
@@ -3564,7 +3750,7 @@ static void l2cap_busy_work(struct work_struct *work)
 
 		if (signal_pending(current)) {
 			err = sock_intr_errno(timeo);
-			goto done;
+			break;
 		}
 
 		release_sock(sk);
@@ -3573,40 +3759,12 @@ static void l2cap_busy_work(struct work_struct *work)
 
 		err = sock_error(sk);
 		if (err)
-			goto done;
-
-		while ((skb = skb_dequeue(BUSY_QUEUE(sk)))) {
-			control = bt_cb(skb)->sar << L2CAP_CTRL_SAR_SHIFT;
-			err = l2cap_ertm_reassembly_sdu(sk, skb, control);
-			if (err < 0) {
-				skb_queue_head(BUSY_QUEUE(sk), skb);
-				break;
-			}
-
-			pi->buffer_seq = (pi->buffer_seq + 1) % 64;
-		}
+			break;
 
-		if (!skb)
+		if (l2cap_try_push_rx_skb(sk) == 0)
 			break;
 	}
 
-	if (!(pi->conn_state & L2CAP_CONN_RNR_SENT))
-		goto done;
-
-	control = pi->buffer_seq << L2CAP_CTRL_REQSEQ_SHIFT;
-	control |= L2CAP_SUPER_RCV_READY | L2CAP_CTRL_POLL;
-	l2cap_send_sframe(pi, control);
-	l2cap_pi(sk)->retry_count = 1;
-
-	del_timer(&pi->retrans_timer);
-	__mod_monitor_timer();
-
-	l2cap_pi(sk)->conn_state |= L2CAP_CONN_WAIT_F;
-
-done:
-	pi->conn_state &= ~L2CAP_CONN_LOCAL_BUSY;
-	pi->conn_state &= ~L2CAP_CONN_RNR_SENT;
-
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(sk_sleep(sk), &wait);
 
@@ -3621,7 +3779,9 @@ static int l2cap_push_rx_skb(struct sock *sk, struct sk_buff *skb, u16 control)
 	if (pi->conn_state & L2CAP_CONN_LOCAL_BUSY) {
 		bt_cb(skb)->sar = control >> L2CAP_CTRL_SAR_SHIFT;
 		__skb_queue_tail(BUSY_QUEUE(sk), skb);
-		return -EBUSY;
+		return l2cap_try_push_rx_skb(sk);
+
+
 	}
 
 	err = l2cap_ertm_reassembly_sdu(sk, skb, control);
@@ -3631,6 +3791,8 @@ static int l2cap_push_rx_skb(struct sock *sk, struct sk_buff *skb, u16 control)
 	}
 
 	/* Busy Condition */
+	BT_DBG("sk %p, Enter local busy", sk);
+
 	pi->conn_state |= L2CAP_CONN_LOCAL_BUSY;
 	bt_cb(skb)->sar = control >> L2CAP_CTRL_SAR_SHIFT;
 	__skb_queue_tail(BUSY_QUEUE(sk), skb);
@@ -3641,6 +3803,8 @@ static int l2cap_push_rx_skb(struct sock *sk, struct sk_buff *skb, u16 control)
 
 	pi->conn_state |= L2CAP_CONN_RNR_SENT;
 
+	del_timer(&pi->ack_timer);
+
 	queue_work(_busy_wq, &pi->busy_work);
 
 	return err;
@@ -3754,7 +3918,7 @@ static void l2cap_check_srej_gap(struct sock *sk, u8 tx_seq)
 		l2cap_ertm_reassembly_sdu(sk, skb, control);
 		l2cap_pi(sk)->buffer_seq_srej =
 			(l2cap_pi(sk)->buffer_seq_srej + 1) % 64;
-		tx_seq++;
+		tx_seq = (tx_seq + 1) % 64;
 	}
 }
 
@@ -3790,10 +3954,11 @@ static void l2cap_send_srejframe(struct sock *sk, u8 tx_seq)
 		l2cap_send_sframe(pi, control);
 
 		new = kzalloc(sizeof(struct srej_list), GFP_ATOMIC);
-		new->tx_seq = pi->expected_tx_seq++;
+		new->tx_seq = pi->expected_tx_seq;
+		pi->expected_tx_seq = (pi->expected_tx_seq + 1) % 64;
 		list_add_tail(&new->list, SREJ_LIST(sk));
 	}
-	pi->expected_tx_seq++;
+	pi->expected_tx_seq = (pi->expected_tx_seq + 1) % 64;
 }
 
 static inline int l2cap_data_channel_iframe(struct sock *sk, u16 rx_control, struct sk_buff *skb)
@@ -3802,11 +3967,12 @@ static inline int l2cap_data_channel_iframe(struct sock *sk, u16 rx_control, str
 	u8 tx_seq = __get_txseq(rx_control);
 	u8 req_seq = __get_reqseq(rx_control);
 	u8 sar = rx_control >> L2CAP_CTRL_SAR_SHIFT;
-	u8 tx_seq_offset, expected_tx_seq_offset;
+	int tx_seq_offset, expected_tx_seq_offset;
 	int num_to_ack = (pi->tx_win/6) + 1;
 	int err = 0;
 
-	BT_DBG("sk %p rx_control 0x%4.4x len %d", sk, rx_control, skb->len);
+	BT_DBG("sk %p len %d tx_seq %d rx_control 0x%4.4x", sk, skb->len, tx_seq,
+								rx_control);
 
 	if (L2CAP_CTRL_FINAL & rx_control &&
 			l2cap_pi(sk)->conn_state & L2CAP_CONN_WAIT_F) {
@@ -3828,7 +3994,7 @@ static inline int l2cap_data_channel_iframe(struct sock *sk, u16 rx_control, str
 
 	/* invalid tx_seq */
 	if (tx_seq_offset >= pi->tx_win) {
-		l2cap_send_disconn_req(pi->conn, sk);
+		l2cap_send_disconn_req(pi->conn, sk, ECONNRESET);
 		goto drop;
 	}
 
@@ -3851,6 +4017,7 @@ static inline int l2cap_data_channel_iframe(struct sock *sk, u16 rx_control, str
 				pi->buffer_seq = pi->buffer_seq_srej;
 				pi->conn_state &= ~L2CAP_CONN_SREJ_SENT;
 				l2cap_send_ack(pi);
+				BT_DBG("sk %p, Exit SREJ_SENT", sk);
 			}
 		} else {
 			struct srej_list *l;
@@ -3879,6 +4046,8 @@ static inline int l2cap_data_channel_iframe(struct sock *sk, u16 rx_control, str
 
 		pi->conn_state |= L2CAP_CONN_SREJ_SENT;
 
+		BT_DBG("sk %p, Enter SREJ", sk);
+
 		INIT_LIST_HEAD(SREJ_LIST(sk));
 		pi->buffer_seq_srej = pi->buffer_seq;
 
@@ -3889,6 +4058,8 @@ static inline int l2cap_data_channel_iframe(struct sock *sk, u16 rx_control, str
 		pi->conn_state |= L2CAP_CONN_SEND_PBIT;
 
 		l2cap_send_srejframe(sk, tx_seq);
+
+		del_timer(&pi->ack_timer);
 	}
 	return 0;
 
@@ -3902,6 +4073,10 @@ expected:
 		return 0;
 	}
 
+	err = l2cap_push_rx_skb(sk, skb, rx_control);
+	if (err < 0)
+		return 0;
+
 	if (rx_control & L2CAP_CTRL_FINAL) {
 		if (pi->conn_state & L2CAP_CONN_REJ_ACT)
 			pi->conn_state &= ~L2CAP_CONN_REJ_ACT;
@@ -3909,10 +4084,6 @@ expected:
 			l2cap_retransmit_frames(sk);
 	}
 
-	err = l2cap_push_rx_skb(sk, skb, rx_control);
-	if (err < 0)
-		return 0;
-
 	__mod_ack_timer();
 
 	pi->num_acked = (pi->num_acked + 1) % num_to_ack;
@@ -3930,10 +4101,14 @@ static inline void l2cap_data_channel_rrframe(struct sock *sk, u16 rx_control)
 {
 	struct l2cap_pinfo *pi = l2cap_pi(sk);
 
+	BT_DBG("sk %p, req_seq %d ctrl 0x%4.4x", sk, __get_reqseq(rx_control),
+						rx_control);
+
 	pi->expected_ack_seq = __get_reqseq(rx_control);
 	l2cap_drop_acked_frames(sk);
 
 	if (rx_control & L2CAP_CTRL_POLL) {
+		pi->conn_state |= L2CAP_CONN_SEND_FBIT;
 		if (pi->conn_state & L2CAP_CONN_SREJ_SENT) {
 			if ((pi->conn_state & L2CAP_CONN_REMOTE_BUSY) &&
 					(pi->unacked_frames > 0))
@@ -3962,9 +4137,7 @@ static inline void l2cap_data_channel_rrframe(struct sock *sk, u16 rx_control)
 		if (pi->conn_state & L2CAP_CONN_SREJ_SENT) {
 			l2cap_send_ack(pi);
 		} else {
-			spin_lock_bh(&pi->send_lock);
 			l2cap_ertm_send(sk);
-			spin_unlock_bh(&pi->send_lock);
 		}
 	}
 }
@@ -3974,6 +4147,8 @@ static inline void l2cap_data_channel_rejframe(struct sock *sk, u16 rx_control)
 	struct l2cap_pinfo *pi = l2cap_pi(sk);
 	u8 tx_seq = __get_reqseq(rx_control);
 
+	BT_DBG("sk %p, req_seq %d ctrl 0x%4.4x", sk, tx_seq, rx_control);
+
 	pi->conn_state &= ~L2CAP_CONN_REMOTE_BUSY;
 
 	pi->expected_ack_seq = tx_seq;
@@ -3996,16 +4171,18 @@ static inline void l2cap_data_channel_srejframe(struct sock *sk, u16 rx_control)
 	struct l2cap_pinfo *pi = l2cap_pi(sk);
 	u8 tx_seq = __get_reqseq(rx_control);
 
+	BT_DBG("sk %p, req_seq %d ctrl 0x%4.4x", sk, tx_seq, rx_control);
+
 	pi->conn_state &= ~L2CAP_CONN_REMOTE_BUSY;
 
 	if (rx_control & L2CAP_CTRL_POLL) {
 		pi->expected_ack_seq = tx_seq;
 		l2cap_drop_acked_frames(sk);
+
+		pi->conn_state |= L2CAP_CONN_SEND_FBIT;
 		l2cap_retransmit_one_frame(sk, tx_seq);
 
-		spin_lock_bh(&pi->send_lock);
 		l2cap_ertm_send(sk);
-		spin_unlock_bh(&pi->send_lock);
 
 		if (pi->conn_state & L2CAP_CONN_WAIT_F) {
 			pi->srej_save_reqseq = tx_seq;
@@ -4031,10 +4208,15 @@ static inline void l2cap_data_channel_rnrframe(struct sock *sk, u16 rx_control)
 	struct l2cap_pinfo *pi = l2cap_pi(sk);
 	u8 tx_seq = __get_reqseq(rx_control);
 
+	BT_DBG("sk %p, req_seq %d ctrl 0x%4.4x", sk, tx_seq, rx_control);
+
 	pi->conn_state |= L2CAP_CONN_REMOTE_BUSY;
 	pi->expected_ack_seq = tx_seq;
 	l2cap_drop_acked_frames(sk);
 
+	if (rx_control & L2CAP_CTRL_POLL)
+		pi->conn_state |= L2CAP_CONN_SEND_FBIT;
+
 	if (!(pi->conn_state & L2CAP_CONN_SREJ_SENT)) {
 		del_timer(&pi->retrans_timer);
 		if (rx_control & L2CAP_CTRL_POLL)
@@ -4082,12 +4264,83 @@ static inline int l2cap_data_channel_sframe(struct sock *sk, u16 rx_control, str
 	return 0;
 }
 
+static int l2cap_ertm_data_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
+	u16 control;
+	u8 req_seq;
+	int len, next_tx_seq_offset, req_seq_offset;
+
+	control = get_unaligned_le16(skb->data);
+	skb_pull(skb, 2);
+	len = skb->len;
+
+	/*
+	 * We can just drop the corrupted I-frame here.
+	 * Receiver will miss it and start proper recovery
+	 * procedures and ask retransmission.
+	 */
+	if (l2cap_check_fcs(pi, skb))
+		goto drop;
+
+	if (__is_sar_start(control) && __is_iframe(control))
+		len -= 2;
+
+	if (pi->fcs == L2CAP_FCS_CRC16)
+		len -= 2;
+
+	if (len > pi->mps) {
+		l2cap_send_disconn_req(pi->conn, sk, ECONNRESET);
+		goto drop;
+	}
+
+	req_seq = __get_reqseq(control);
+	req_seq_offset = (req_seq - pi->expected_ack_seq) % 64;
+	if (req_seq_offset < 0)
+		req_seq_offset += 64;
+
+	next_tx_seq_offset =
+		(pi->next_tx_seq - pi->expected_ack_seq) % 64;
+	if (next_tx_seq_offset < 0)
+		next_tx_seq_offset += 64;
+
+	/* check for invalid req-seq */
+	if (req_seq_offset > next_tx_seq_offset) {
+		l2cap_send_disconn_req(pi->conn, sk, ECONNRESET);
+		goto drop;
+	}
+
+	if (__is_iframe(control)) {
+		if (len < 0) {
+			l2cap_send_disconn_req(pi->conn, sk, ECONNRESET);
+			goto drop;
+		}
+
+		l2cap_data_channel_iframe(sk, control, skb);
+	} else {
+		if (len != 0) {
+			BT_ERR("%d", len);
+			l2cap_send_disconn_req(pi->conn, sk, ECONNRESET);
+			goto drop;
+		}
+
+		l2cap_data_channel_sframe(sk, control, skb);
+	}
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
 static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb)
 {
 	struct sock *sk;
 	struct l2cap_pinfo *pi;
-	u16 control, len;
-	u8 tx_seq, req_seq, next_tx_seq_offset, req_seq_offset;
+	u16 control;
+	u8 tx_seq;
+	int len;
 
 	sk = l2cap_get_chan_by_scid(&conn->chan_list, cid);
 	if (!sk) {
@@ -4117,59 +4370,11 @@ static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk
 		break;
 
 	case L2CAP_MODE_ERTM:
-		control = get_unaligned_le16(skb->data);
-		skb_pull(skb, 2);
-		len = skb->len;
-
-		if (__is_sar_start(control))
-			len -= 2;
-
-		if (pi->fcs == L2CAP_FCS_CRC16)
-			len -= 2;
-
-		/*
-		 * We can just drop the corrupted I-frame here.
-		 * Receiver will miss it and start proper recovery
-		 * procedures and ask retransmission.
-		 */
-		if (len > pi->mps) {
-			l2cap_send_disconn_req(pi->conn, sk);
-			goto drop;
-		}
-
-		if (l2cap_check_fcs(pi, skb))
-			goto drop;
-
-		req_seq = __get_reqseq(control);
-		req_seq_offset = (req_seq - pi->expected_ack_seq) % 64;
-		if (req_seq_offset < 0)
-			req_seq_offset += 64;
-
-		next_tx_seq_offset =
-			(pi->next_tx_seq - pi->expected_ack_seq) % 64;
-		if (next_tx_seq_offset < 0)
-			next_tx_seq_offset += 64;
-
-		/* check for invalid req-seq */
-		if (req_seq_offset > next_tx_seq_offset) {
-			l2cap_send_disconn_req(pi->conn, sk);
-			goto drop;
-		}
-
-		if (__is_iframe(control)) {
-			if (len < 4) {
-				l2cap_send_disconn_req(pi->conn, sk);
-				goto drop;
-			}
-
-			l2cap_data_channel_iframe(sk, control, skb);
+		if (!sock_owned_by_user(sk)) {
+			l2cap_ertm_data_rcv(sk, skb);
 		} else {
-			if (len != 0) {
-				l2cap_send_disconn_req(pi->conn, sk);
+			if (sk_add_backlog(sk, skb))
 				goto drop;
-			}
-
-			l2cap_data_channel_sframe(sk, control, skb);
 		}
 
 		goto done;
@@ -4179,16 +4384,16 @@ static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk
 		skb_pull(skb, 2);
 		len = skb->len;
 
+		if (l2cap_check_fcs(pi, skb))
+			goto drop;
+
 		if (__is_sar_start(control))
 			len -= 2;
 
 		if (pi->fcs == L2CAP_FCS_CRC16)
 			len -= 2;
 
-		if (len > pi->mps || len < 4 || __is_sframe(control))
-			goto drop;
-
-		if (l2cap_check_fcs(pi, skb))
+		if (len > pi->mps || len < 0 || __is_sframe(control))
 			goto drop;
 
 		tx_seq = __get_txseq(control);
@@ -4288,7 +4493,7 @@ static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
 	struct hlist_node *node;
 
 	if (type != ACL_LINK)
-		return 0;
+		return -EINVAL;
 
 	BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
 
@@ -4321,7 +4526,7 @@ static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
 	BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status);
 
 	if (hcon->type != ACL_LINK)
-		return 0;
+		return -EINVAL;
 
 	if (!status) {
 		conn = l2cap_conn_add(hcon, status);
@@ -4350,7 +4555,7 @@ static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
 	BT_DBG("hcon %p reason %d", hcon, reason);
 
 	if (hcon->type != ACL_LINK)
-		return 0;
+		return -EINVAL;
 
 	l2cap_conn_del(hcon, bt_err(reason));
 
@@ -4459,6 +4664,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
 
 	if (flags & ACL_START) {
 		struct l2cap_hdr *hdr;
+		struct sock *sk;
+		u16 cid;
 		int len;
 
 		if (conn->rx_len) {
@@ -4469,7 +4676,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
 			l2cap_conn_unreliable(conn, ECOMM);
 		}
 
-		if (skb->len < 2) {
+		/* Start fragment always begin with Basic L2CAP header */
+		if (skb->len < L2CAP_HDR_SIZE) {
 			BT_ERR("Frame is too short (len %d)", skb->len);
 			l2cap_conn_unreliable(conn, ECOMM);
 			goto drop;
@@ -4477,6 +4685,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
 
 		hdr = (struct l2cap_hdr *) skb->data;
 		len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
+		cid = __le16_to_cpu(hdr->cid);
 
 		if (len == skb->len) {
 			/* Complete frame received */
@@ -4493,6 +4702,19 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
 			goto drop;
 		}
 
+		sk = l2cap_get_chan_by_scid(&conn->chan_list, cid);
+
+		if (sk && l2cap_pi(sk)->imtu < len - L2CAP_HDR_SIZE) {
+			BT_ERR("Frame exceeding recv MTU (len %d, MTU %d)",
+					len, l2cap_pi(sk)->imtu);
+			bh_unlock_sock(sk);
+			l2cap_conn_unreliable(conn, ECOMM);
+			goto drop;
+		}
+
+		if (sk)
+			bh_unlock_sock(sk);
+
 		/* Allocate skb for the complete frame (with header) */
 		conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC);
 		if (!conn->rx_skb)
@@ -4679,14 +4901,8 @@ EXPORT_SYMBOL(l2cap_load);
 module_init(l2cap_init);
 module_exit(l2cap_exit);
 
-module_param(enable_ertm, bool, 0644);
-MODULE_PARM_DESC(enable_ertm, "Enable enhanced retransmission mode");
-
-module_param(max_transmit, uint, 0644);
-MODULE_PARM_DESC(max_transmit, "Max transmit value (default = 3)");
-
-module_param(tx_window, uint, 0644);
-MODULE_PARM_DESC(tx_window, "Transmission window size value (default = 63)");
+module_param(disable_ertm, bool, 0644);
+MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode");
 
 MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
 MODULE_DESCRIPTION("Bluetooth L2CAP ver " VERSION);
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index ad2af5814e4..b826d1bf10d 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -51,8 +51,8 @@ char *batostr(bdaddr_t *ba)
 
 	i ^= 1;
 	sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X",
-		ba->b[0], ba->b[1], ba->b[2],
-		ba->b[3], ba->b[4], ba->b[5]);
+		ba->b[5], ba->b[4], ba->b[3],
+		ba->b[2], ba->b[1], ba->b[0]);
 
 	return str[i];
 }
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 7dca91bb8c5..39a5d87e33b 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -113,11 +113,10 @@ static void rfcomm_session_del(struct rfcomm_session *s);
 #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1)
 #define __get_rpn_parity(line)    (((line) >> 3) & 0x7)
 
-static inline void rfcomm_schedule(uint event)
+static inline void rfcomm_schedule(void)
 {
 	if (!rfcomm_thread)
 		return;
-	//set_bit(event, &rfcomm_event);
 	set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event);
 	wake_up_process(rfcomm_thread);
 }
@@ -179,13 +178,13 @@ static unsigned char rfcomm_crc_table[256] = {
 /* FCS on 2 bytes */
 static inline u8 __fcs(u8 *data)
 {
-	return (0xff - __crc(data));
+	return 0xff - __crc(data);
 }
 
 /* FCS on 3 bytes */
 static inline u8 __fcs2(u8 *data)
 {
-	return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]);
+	return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]];
 }
 
 /* Check FCS */
@@ -203,13 +202,13 @@ static inline int __check_fcs(u8 *data, int type, u8 fcs)
 static void rfcomm_l2state_change(struct sock *sk)
 {
 	BT_DBG("%p state %d", sk, sk->sk_state);
-	rfcomm_schedule(RFCOMM_SCHED_STATE);
+	rfcomm_schedule();
 }
 
 static void rfcomm_l2data_ready(struct sock *sk, int bytes)
 {
 	BT_DBG("%p bytes %d", sk, bytes);
-	rfcomm_schedule(RFCOMM_SCHED_RX);
+	rfcomm_schedule();
 }
 
 static int rfcomm_l2sock_create(struct socket **sock)
@@ -255,7 +254,7 @@ static void rfcomm_session_timeout(unsigned long arg)
 	BT_DBG("session %p state %ld", s, s->state);
 
 	set_bit(RFCOMM_TIMED_OUT, &s->flags);
-	rfcomm_schedule(RFCOMM_SCHED_TIMEO);
+	rfcomm_schedule();
 }
 
 static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout)
@@ -283,7 +282,7 @@ static void rfcomm_dlc_timeout(unsigned long arg)
 
 	set_bit(RFCOMM_TIMED_OUT, &d->flags);
 	rfcomm_dlc_put(d);
-	rfcomm_schedule(RFCOMM_SCHED_TIMEO);
+	rfcomm_schedule();
 }
 
 static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout)
@@ -465,7 +464,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
 	case BT_CONFIG:
 		if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
 			set_bit(RFCOMM_AUTH_REJECT, &d->flags);
-			rfcomm_schedule(RFCOMM_SCHED_AUTH);
+			rfcomm_schedule();
 			break;
 		}
 		/* Fall through */
@@ -485,7 +484,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
 	case BT_CONNECT2:
 		if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
 			set_bit(RFCOMM_AUTH_REJECT, &d->flags);
-			rfcomm_schedule(RFCOMM_SCHED_AUTH);
+			rfcomm_schedule();
 			break;
 		}
 		/* Fall through */
@@ -533,7 +532,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
 	skb_queue_tail(&d->tx_queue, skb);
 
 	if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
-		rfcomm_schedule(RFCOMM_SCHED_TX);
+		rfcomm_schedule();
 	return len;
 }
 
@@ -545,7 +544,7 @@ void __rfcomm_dlc_throttle(struct rfcomm_dlc *d)
 		d->v24_sig |= RFCOMM_V24_FC;
 		set_bit(RFCOMM_MSC_PENDING, &d->flags);
 	}
-	rfcomm_schedule(RFCOMM_SCHED_TX);
+	rfcomm_schedule();
 }
 
 void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
@@ -556,7 +555,7 @@ void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d)
 		d->v24_sig &= ~RFCOMM_V24_FC;
 		set_bit(RFCOMM_MSC_PENDING, &d->flags);
 	}
-	rfcomm_schedule(RFCOMM_SCHED_TX);
+	rfcomm_schedule();
 }
 
 /*
@@ -577,7 +576,7 @@ int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig)
 	d->v24_sig = v24_sig;
 
 	if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags))
-		rfcomm_schedule(RFCOMM_SCHED_TX);
+		rfcomm_schedule();
 
 	return 0;
 }
@@ -816,7 +815,7 @@ static int rfcomm_queue_disc(struct rfcomm_dlc *d)
 	cmd->fcs  = __fcs2((u8 *) cmd);
 
 	skb_queue_tail(&d->tx_queue, skb);
-	rfcomm_schedule(RFCOMM_SCHED_TX);
+	rfcomm_schedule();
 	return 0;
 }
 
@@ -1415,8 +1414,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 		return 0;
 
 	if (len == 1) {
-		/* This is a request, return default settings */
-		bit_rate  = RFCOMM_RPN_BR_115200;
+		/* This is a request, return default (according to ETSI TS 07.10) settings */
+		bit_rate  = RFCOMM_RPN_BR_9600;
 		data_bits = RFCOMM_RPN_DATA_8;
 		stop_bits = RFCOMM_RPN_STOP_1;
 		parity    = RFCOMM_RPN_PARITY_NONE;
@@ -1431,9 +1430,9 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_
 
 	if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) {
 		bit_rate = rpn->bit_rate;
-		if (bit_rate != RFCOMM_RPN_BR_115200) {
+		if (bit_rate > RFCOMM_RPN_BR_230400) {
 			BT_DBG("RPN bit rate mismatch 0x%x", bit_rate);
-			bit_rate = RFCOMM_RPN_BR_115200;
+			bit_rate = RFCOMM_RPN_BR_9600;
 			rpn_mask ^= RFCOMM_RPN_PM_BITRATE;
 		}
 	}
@@ -1698,7 +1697,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb)
 		break;
 
 	default:
-		BT_ERR("Unknown packet type 0x%02x\n", type);
+		BT_ERR("Unknown packet type 0x%02x", type);
 		break;
 	}
 	kfree_skb(skb);
@@ -1884,7 +1883,7 @@ static inline void rfcomm_accept_connection(struct rfcomm_session *s)
 		 * L2CAP MTU minus UIH header and FCS. */
 		s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5;
 
-		rfcomm_schedule(RFCOMM_SCHED_RX);
+		rfcomm_schedule();
 	} else
 		sock_release(nsock);
 }
@@ -2093,7 +2092,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
 
 	rfcomm_session_put(s);
 
-	rfcomm_schedule(RFCOMM_SCHED_AUTH);
+	rfcomm_schedule();
 }
 
 static struct hci_cb rfcomm_cb = {
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 43fbf6b4b4b..aec505f934d 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -82,11 +82,14 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
 static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
 {
 	struct sock *sk = d->owner, *parent;
+	unsigned long flags;
+
 	if (!sk)
 		return;
 
 	BT_DBG("dlc %p state %ld err %d", d, d->state, err);
 
+	local_irq_save(flags);
 	bh_lock_sock(sk);
 
 	if (err)
@@ -108,6 +111,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
 	}
 
 	bh_unlock_sock(sk);
+	local_irq_restore(flags);
 
 	if (parent && sock_flag(sk, SOCK_ZAPPED)) {
 		/* We have to drop DLC lock here, otherwise
@@ -617,121 +621,29 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
 	return sent;
 }
 
-static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-
-		if (!skb_queue_empty(&sk->sk_receive_queue) ||
-		    sk->sk_err ||
-		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
-		    signal_pending(current) ||
-		    !timeo)
-			break;
-
-		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-		release_sock(sk);
-		timeo = schedule_timeout(timeo);
-		lock_sock(sk);
-		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-	}
-
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(sk_sleep(sk), &wait);
-	return timeo;
-}
-
 static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 			       struct msghdr *msg, size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
 	struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
-	int err = 0;
-	size_t target, copied = 0;
-	long timeo;
+	int len;
 
 	if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
 		rfcomm_dlc_accept(d);
 		return 0;
 	}
 
-	if (flags & MSG_OOB)
-		return -EOPNOTSUPP;
-
-	msg->msg_namelen = 0;
-
-	BT_DBG("sk %p size %zu", sk, size);
+	len = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags);
 
 	lock_sock(sk);
+	if (!(flags & MSG_PEEK) && len > 0)
+		atomic_sub(len, &sk->sk_rmem_alloc);
 
-	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
-	timeo  = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
-	do {
-		struct sk_buff *skb;
-		int chunk;
-
-		skb = skb_dequeue(&sk->sk_receive_queue);
-		if (!skb) {
-			if (copied >= target)
-				break;
-
-			if ((err = sock_error(sk)) != 0)
-				break;
-			if (sk->sk_shutdown & RCV_SHUTDOWN)
-				break;
-
-			err = -EAGAIN;
-			if (!timeo)
-				break;
-
-			timeo = rfcomm_sock_data_wait(sk, timeo);
-
-			if (signal_pending(current)) {
-				err = sock_intr_errno(timeo);
-				goto out;
-			}
-			continue;
-		}
-
-		chunk = min_t(unsigned int, skb->len, size);
-		if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
-			skb_queue_head(&sk->sk_receive_queue, skb);
-			if (!copied)
-				copied = -EFAULT;
-			break;
-		}
-		copied += chunk;
-		size   -= chunk;
-
-		sock_recv_ts_and_drops(msg, sk, skb);
-
-		if (!(flags & MSG_PEEK)) {
-			atomic_sub(chunk, &sk->sk_rmem_alloc);
-
-			skb_pull(skb, chunk);
-			if (skb->len) {
-				skb_queue_head(&sk->sk_receive_queue, skb);
-				break;
-			}
-			kfree_skb(skb);
-
-		} else {
-			/* put message back and return */
-			skb_queue_head(&sk->sk_receive_queue, skb);
-			break;
-		}
-	} while (size);
-
-out:
 	if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2))
 		rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc);
-
 	release_sock(sk);
-	return copied ? : err;
+
+	return len;
 }
 
 static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
@@ -1152,7 +1064,7 @@ error:
 	return err;
 }
 
-void rfcomm_cleanup_sockets(void)
+void __exit rfcomm_cleanup_sockets(void)
 {
 	debugfs_remove(rfcomm_sock_debugfs);
 
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 309b6c261b2..a9b81f5dacd 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -183,9 +183,7 @@ static struct device *rfcomm_get_device(struct rfcomm_dev *dev)
 static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf)
 {
 	struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
-	bdaddr_t bdaddr;
-	baswap(&bdaddr, &dev->dst);
-	return sprintf(buf, "%s\n", batostr(&bdaddr));
+	return sprintf(buf, "%s\n", batostr(&dev->dst));
 }
 
 static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf)
@@ -844,10 +842,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned
 		BT_DBG("TIOCMIWAIT");
 		break;
 
-	case TIOCGICOUNT:
-		BT_DBG("TIOCGICOUNT");
-		break;
-
 	case TIOCGSERIAL:
 		BT_ERR("TIOCGSERIAL is not supported");
 		return -ENOIOCTLCMD;
@@ -1153,7 +1147,7 @@ static const struct tty_operations rfcomm_ops = {
 	.tiocmset		= rfcomm_tty_tiocmset,
 };
 
-int rfcomm_init_ttys(void)
+int __init rfcomm_init_ttys(void)
 {
 	rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS);
 	if (!rfcomm_tty_driver)
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 76357b54775..c8436fa3134 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -63,7 +63,6 @@ static int __init br_init(void)
 		goto err_out4;
 
 	brioctl_set(br_ioctl_deviceless_stub);
-	br_handle_frame_hook = br_handle_frame;
 
 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
 	br_fdb_test_addr_hook = br_fdb_test_addr;
@@ -100,7 +99,6 @@ static void __exit br_deinit(void)
 	br_fdb_test_addr_hook = NULL;
 #endif
 
-	br_handle_frame_hook = NULL;
 	br_fdb_fini();
 }
 
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 753fc4221f3..17cb0b63357 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -22,7 +22,7 @@
 #include <asm/uaccess.h>
 #include "br_private.h"
 
-/* net device transmit always called with no BH (preempt_disabled) */
+/* net device transmit always called with BH disabled */
 netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
@@ -38,17 +38,26 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 #endif
 
+	u64_stats_update_begin(&brstats->syncp);
 	brstats->tx_packets++;
 	brstats->tx_bytes += skb->len;
+	u64_stats_update_end(&brstats->syncp);
 
 	BR_INPUT_SKB_CB(skb)->brdev = dev;
 
 	skb_reset_mac_header(skb);
 	skb_pull(skb, ETH_HLEN);
 
+	rcu_read_lock();
 	if (is_multicast_ether_addr(dest)) {
-		if (br_multicast_rcv(br, NULL, skb))
+		if (unlikely(netpoll_tx_running(dev))) {
+			br_flood_deliver(br, skb);
+			goto out;
+		}
+		if (br_multicast_rcv(br, NULL, skb)) {
+			kfree_skb(skb);
 			goto out;
+		}
 
 		mdst = br_mdb_get(br, skb);
 		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb))
@@ -61,6 +70,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 		br_flood_deliver(br, skb);
 
 out:
+	rcu_read_unlock();
 	return NETDEV_TX_OK;
 }
 
@@ -92,21 +102,25 @@ static int br_dev_stop(struct net_device *dev)
 	return 0;
 }
 
-static struct net_device_stats *br_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev,
+						struct rtnl_link_stats64 *stats)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	struct br_cpu_netstats sum = { 0 };
+	struct br_cpu_netstats tmp, sum = { 0 };
 	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
+		unsigned int start;
 		const struct br_cpu_netstats *bstats
 			= per_cpu_ptr(br->stats, cpu);
-
-		sum.tx_bytes   += bstats->tx_bytes;
-		sum.tx_packets += bstats->tx_packets;
-		sum.rx_bytes   += bstats->rx_bytes;
-		sum.rx_packets += bstats->rx_packets;
+		do {
+			start = u64_stats_fetch_begin(&bstats->syncp);
+			memcpy(&tmp, bstats, sizeof(tmp));
+		} while (u64_stats_fetch_retry(&bstats->syncp, start));
+		sum.tx_bytes   += tmp.tx_bytes;
+		sum.tx_packets += tmp.tx_packets;
+		sum.rx_bytes   += tmp.rx_bytes;
+		sum.rx_packets += tmp.rx_packets;
 	}
 
 	stats->tx_bytes   = sum.tx_bytes;
@@ -127,7 +141,7 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
 
 #ifdef CONFIG_BRIDGE_NETFILTER
 	/* remember the MTU in the rtable for PMTU */
-	br->fake_rtable.u.dst.metrics[RTAX_MTU - 1] = new_mtu;
+	br->fake_rtable.dst.metrics[RTAX_MTU - 1] = new_mtu;
 #endif
 
 	return 0;
@@ -198,66 +212,87 @@ static int br_set_tx_csum(struct net_device *dev, u32 data)
 	return 0;
 }
 
+static int br_set_flags(struct net_device *netdev, u32 data)
+{
+	return ethtool_op_set_flags(netdev, data, ETH_FLAG_TXVLAN);
+}
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
-static bool br_devices_support_netpoll(struct net_bridge *br)
+static void br_poll_controller(struct net_device *br_dev)
 {
-	struct net_bridge_port *p;
-	bool ret = true;
-	int count = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&br->lock, flags);
-	list_for_each_entry(p, &br->port_list, list) {
-		count++;
-		if ((p->dev->priv_flags & IFF_DISABLE_NETPOLL) ||
-		    !p->dev->netdev_ops->ndo_poll_controller)
-			ret = false;
-	}
-	spin_unlock_irqrestore(&br->lock, flags);
-	return count != 0 && ret;
 }
 
-void br_netpoll_cleanup(struct net_device *dev)
+static void br_netpoll_cleanup(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_port *p, *n;
-	const struct net_device_ops *ops;
 
-	br->dev->npinfo = NULL;
 	list_for_each_entry_safe(p, n, &br->port_list, list) {
-		if (p->dev) {
-			ops = p->dev->netdev_ops;
-			if (ops->ndo_netpoll_cleanup)
-				ops->ndo_netpoll_cleanup(p->dev);
-			else
-				p->dev->npinfo = NULL;
-		}
+		br_netpoll_disable(p);
 	}
 }
 
-void br_netpoll_disable(struct net_bridge *br,
-			struct net_device *dev)
+static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
 {
-	if (br_devices_support_netpoll(br))
-		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
-	if (dev->netdev_ops->ndo_netpoll_cleanup)
-		dev->netdev_ops->ndo_netpoll_cleanup(dev);
-	else
-		dev->npinfo = NULL;
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *p, *n;
+	int err = 0;
+
+	list_for_each_entry_safe(p, n, &br->port_list, list) {
+		if (!p->dev)
+			continue;
+
+		err = br_netpoll_enable(p);
+		if (err)
+			goto fail;
+	}
+
+out:
+	return err;
+
+fail:
+	br_netpoll_cleanup(dev);
+	goto out;
 }
 
-void br_netpoll_enable(struct net_bridge *br,
-		       struct net_device *dev)
+int br_netpoll_enable(struct net_bridge_port *p)
 {
-	if (br_devices_support_netpoll(br)) {
-		br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
-		if (br->dev->npinfo)
-			dev->npinfo = br->dev->npinfo;
-	} else if (!(br->dev->priv_flags & IFF_DISABLE_NETPOLL)) {
-		br->dev->priv_flags |= IFF_DISABLE_NETPOLL;
-		br_info(br,"new device %s does not support netpoll (disabling)",
-			dev->name);
+	struct netpoll *np;
+	int err = 0;
+
+	np = kzalloc(sizeof(*p->np), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!np)
+		goto out;
+
+	np->dev = p->dev;
+
+	err = __netpoll_setup(np);
+	if (err) {
+		kfree(np);
+		goto out;
 	}
+
+	p->np = np;
+
+out:
+	return err;
+}
+
+void br_netpoll_disable(struct net_bridge_port *p)
+{
+	struct netpoll *np = p->np;
+
+	if (!np)
+		return;
+
+	p->np = NULL;
+
+	/* Wait for transmitting packets to finish before freeing. */
+	synchronize_rcu_bh();
+
+	__netpoll_cleanup(np);
+	kfree(np);
 }
 
 #endif
@@ -274,19 +309,22 @@ static const struct ethtool_ops br_ethtool_ops = {
 	.get_ufo	= ethtool_op_get_ufo,
 	.set_ufo	= ethtool_op_set_ufo,
 	.get_flags	= ethtool_op_get_flags,
+	.set_flags	= br_set_flags,
 };
 
 static const struct net_device_ops br_netdev_ops = {
 	.ndo_open		 = br_dev_open,
 	.ndo_stop		 = br_dev_stop,
 	.ndo_start_xmit		 = br_dev_xmit,
-	.ndo_get_stats		 = br_get_stats,
+	.ndo_get_stats64	 = br_get_stats64,
 	.ndo_set_mac_address	 = br_set_mac_address,
 	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
 	.ndo_change_mtu		 = br_change_mtu,
 	.ndo_do_ioctl		 = br_dev_ioctl,
 #ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_netpoll_setup	 = br_netpoll_setup,
 	.ndo_netpoll_cleanup	 = br_netpoll_cleanup,
+	.ndo_poll_controller	 = br_poll_controller,
 #endif
 };
 
@@ -311,5 +349,5 @@ void br_dev_setup(struct net_device *dev)
 
 	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
 			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
-			NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
+			NETIF_F_NETNS_LOCAL | NETIF_F_GSO | NETIF_F_HW_VLAN_TX;
 }
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index b01dde35a69..90512ccfd3e 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -214,7 +214,7 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 	spin_unlock_bh(&br->hash_lock);
 }
 
-/* No locking or refcounting, assumes caller has no preempt (rcu_read_lock) */
+/* No locking or refcounting, assumes caller has rcu_read_lock */
 struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
 					  const unsigned char *addr)
 {
@@ -240,11 +240,11 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 	struct net_bridge_fdb_entry *fdb;
 	int ret;
 
-	if (!dev->br_port)
+	if (!br_port_exists(dev))
 		return 0;
 
 	rcu_read_lock();
-	fdb = __br_fdb_get(dev->br_port->br, addr);
+	fdb = __br_fdb_get(br_port_get_rcu(dev)->br, addr);
 	ret = fdb && fdb->dst->dev != dev &&
 		fdb->dst->state == BR_STATE_FORWARDING;
 	rcu_read_unlock();
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 595da45f908..cbfe87f0f34 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -67,6 +67,17 @@ int br_forward_finish(struct sk_buff *skb)
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
 	skb->dev = to->dev;
+
+	if (unlikely(netpoll_tx_running(to->dev))) {
+		if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
+			kfree_skb(skb);
+		else {
+			skb_push(skb, ETH_HLEN);
+			br_netpoll_send_skb(to, skb);
+		}
+		return;
+	}
+
 	NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 		br_forward_finish);
 }
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 18b245e2c00..89ad25a7620 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -61,30 +61,27 @@ static int port_cost(struct net_device *dev)
 }
 
 
-/*
- * Check for port carrier transistions.
- * Called from work queue to allow for calling functions that
- * might sleep (such as speed check), and to debounce.
- */
+/* Check for port carrier transistions. */
 void br_port_carrier_check(struct net_bridge_port *p)
 {
 	struct net_device *dev = p->dev;
 	struct net_bridge *br = p->br;
 
-	if (netif_carrier_ok(dev))
+	if (netif_running(dev) && netif_carrier_ok(dev))
 		p->path_cost = port_cost(dev);
 
-	if (netif_running(br->dev)) {
-		spin_lock_bh(&br->lock);
-		if (netif_carrier_ok(dev)) {
-			if (p->state == BR_STATE_DISABLED)
-				br_stp_enable_port(p);
-		} else {
-			if (p->state != BR_STATE_DISABLED)
-				br_stp_disable_port(p);
-		}
-		spin_unlock_bh(&br->lock);
+	if (!netif_running(br->dev))
+		return;
+
+	spin_lock_bh(&br->lock);
+	if (netif_running(dev) && netif_carrier_ok(dev)) {
+		if (p->state == BR_STATE_DISABLED)
+			br_stp_enable_port(p);
+	} else {
+		if (p->state != BR_STATE_DISABLED)
+			br_stp_disable_port(p);
 	}
+	spin_unlock_bh(&br->lock);
 }
 
 static void release_nbp(struct kobject *kobj)
@@ -147,14 +144,17 @@ static void del_nbp(struct net_bridge_port *p)
 
 	list_del_rcu(&p->list);
 
-	rcu_assign_pointer(dev->br_port, NULL);
+	dev->priv_flags &= ~IFF_BRIDGE_PORT;
+
+	netdev_rx_handler_unregister(dev);
 
 	br_multicast_del_port(p);
 
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
 
-	br_netpoll_disable(br, dev);
+	br_netpoll_disable(p);
+
 	call_rcu(&p->rcu, destroy_nbp_rcu);
 }
 
@@ -167,8 +167,6 @@ static void del_br(struct net_bridge *br, struct list_head *head)
 		del_nbp(p);
 	}
 
-	br_netpoll_cleanup(br->dev);
-
 	del_timer_sync(&br->gc_timer);
 
 	br_sysfs_delbr(br->dev);
@@ -400,7 +398,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 		return -ELOOP;
 
 	/* Device is already being bridged */
-	if (dev->br_port != NULL)
+	if (br_port_exists(dev))
 		return -EBUSY;
 
 	/* No bridging devices that dislike that (e.g. wireless) */
@@ -428,7 +426,15 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err2;
 
-	rcu_assign_pointer(dev->br_port, p);
+	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
+		goto err3;
+
+	err = netdev_rx_handler_register(dev, br_handle_frame, p);
+	if (err)
+		goto err3;
+
+	dev->priv_flags |= IFF_BRIDGE_PORT;
+
 	dev_disable_lro(dev);
 
 	list_add_rcu(&p->list, &br->port_list);
@@ -448,9 +454,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
 
-	br_netpoll_enable(br, dev);
-
 	return 0;
+err3:
+	sysfs_remove_link(br->ifobj, p->dev->name);
 err2:
 	br_fdb_delete_by_port(br, p, 1);
 err1:
@@ -467,9 +473,13 @@ put_back:
 /* called with RTNL */
 int br_del_if(struct net_bridge *br, struct net_device *dev)
 {
-	struct net_bridge_port *p = dev->br_port;
+	struct net_bridge_port *p;
+
+	if (!br_port_exists(dev))
+		return -EINVAL;
 
-	if (!p || p->br != br)
+	p = br_port_get(dev);
+	if (p->br != br)
 		return -EINVAL;
 
 	del_nbp(p);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index d36e700f7a2..25207a1f182 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -27,8 +27,10 @@ static int br_pass_frame_up(struct sk_buff *skb)
 	struct net_bridge *br = netdev_priv(brdev);
 	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
 
+	u64_stats_update_begin(&brstats->syncp);
 	brstats->rx_packets++;
 	brstats->rx_bytes += skb->len;
+	u64_stats_update_end(&brstats->syncp);
 
 	indev = skb->dev;
 	skb->dev = brdev;
@@ -37,11 +39,11 @@ static int br_pass_frame_up(struct sk_buff *skb)
 		       netif_receive_skb);
 }
 
-/* note: already called with rcu_read_lock (preempt_disabled) */
+/* note: already called with rcu_read_lock */
 int br_handle_frame_finish(struct sk_buff *skb)
 {
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
-	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
 	struct net_bridge *br;
 	struct net_bridge_fdb_entry *dst;
 	struct net_bridge_mdb_entry *mdst;
@@ -108,13 +110,12 @@ drop:
 	goto out;
 }
 
-/* note: already called with rcu_read_lock (preempt_disabled) */
+/* note: already called with rcu_read_lock */
 static int br_handle_local_finish(struct sk_buff *skb)
 {
-	struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
+	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
 
-	if (p)
-		br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
 	return 0;	 /* process further */
 }
 
@@ -131,15 +132,18 @@ static inline int is_link_local(const unsigned char *dest)
 }
 
 /*
- * Called via br_handle_frame_hook.
  * Return NULL if skb is handled
- * note: already called with rcu_read_lock (preempt_disabled)
+ * note: already called with rcu_read_lock
  */
-struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
+struct sk_buff *br_handle_frame(struct sk_buff *skb)
 {
+	struct net_bridge_port *p;
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	int (*rhook)(struct sk_buff *skb);
 
+	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+		return skb;
+
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto drop;
 
@@ -147,13 +151,15 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
 	if (!skb)
 		return NULL;
 
+	p = br_port_get_rcu(skb->dev);
+
 	if (unlikely(is_link_local(dest))) {
 		/* Pause frames shouldn't be passed up by driver anyway */
 		if (skb->protocol == htons(ETH_P_PAUSE))
 			goto drop;
 
 		/* If STP is turned off, then forward */
-		if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
+		if (p->br->stp_enabled == BR_NO_STP)
 			goto forward;
 
 		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 27ae946363f..eb5b256ffc8 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1435,7 +1435,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 	struct icmp6hdr *icmp6h;
 	u8 nexthdr;
 	unsigned len;
-	unsigned offset;
+	int offset;
 	int err;
 
 	if (!pskb_may_pull(skb, sizeof(*ip6h)))
@@ -1728,13 +1728,9 @@ unlock:
 int br_multicast_toggle(struct net_bridge *br, unsigned long val)
 {
 	struct net_bridge_port *port;
-	int err = -ENOENT;
+	int err = 0;
 
 	spin_lock(&br->multicast_lock);
-	if (!netif_running(br->dev))
-		goto unlock;
-
-	err = 0;
 	if (br->multicast_disabled == !val)
 		goto unlock;
 
@@ -1742,6 +1738,9 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val)
 	if (br->multicast_disabled)
 		goto unlock;
 
+	if (!netif_running(br->dev))
+		goto unlock;
+
 	if (br->mdb) {
 		if (br->mdb->old) {
 			err = -EEXIST;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 8fb75f89c4a..865fd7634b6 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -55,28 +55,33 @@ static int brnf_call_arptables __read_mostly = 1;
 static int brnf_filter_vlan_tagged __read_mostly = 0;
 static int brnf_filter_pppoe_tagged __read_mostly = 0;
 #else
+#define brnf_call_iptables 1
+#define brnf_call_ip6tables 1
+#define brnf_call_arptables 1
 #define brnf_filter_vlan_tagged 0
 #define brnf_filter_pppoe_tagged 0
 #endif
 
 static inline __be16 vlan_proto(const struct sk_buff *skb)
 {
-	return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+	if (vlan_tx_tag_present(skb))
+		return skb->protocol;
+	else if (skb->protocol == htons(ETH_P_8021Q))
+		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+	else
+		return 0;
 }
 
 #define IS_VLAN_IP(skb) \
-	(skb->protocol == htons(ETH_P_8021Q) && \
-	 vlan_proto(skb) == htons(ETH_P_IP) && 	\
+	(vlan_proto(skb) == htons(ETH_P_IP) && \
 	 brnf_filter_vlan_tagged)
 
 #define IS_VLAN_IPV6(skb) \
-	(skb->protocol == htons(ETH_P_8021Q) && \
-	 vlan_proto(skb) == htons(ETH_P_IPV6) &&\
+	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
 	 brnf_filter_vlan_tagged)
 
 #define IS_VLAN_ARP(skb) \
-	(skb->protocol == htons(ETH_P_8021Q) &&	\
-	 vlan_proto(skb) == htons(ETH_P_ARP) &&	\
+	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
 	 brnf_filter_vlan_tagged)
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
@@ -103,7 +108,6 @@ static struct dst_ops fake_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.update_pmtu =		fake_update_pmtu,
-	.entries =		ATOMIC_INIT(0),
 };
 
 /*
@@ -117,26 +121,27 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 {
 	struct rtable *rt = &br->fake_rtable;
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
-	rt->u.dst.dev = br->dev;
-	rt->u.dst.path = &rt->u.dst;
-	rt->u.dst.metrics[RTAX_MTU - 1] = 1500;
-	rt->u.dst.flags	= DST_NOXFRM;
-	rt->u.dst.ops = &fake_dst_ops;
+	atomic_set(&rt->dst.__refcnt, 1);
+	rt->dst.dev = br->dev;
+	rt->dst.path = &rt->dst;
+	rt->dst.metrics[RTAX_MTU - 1] = 1500;
+	rt->dst.flags	= DST_NOXFRM;
+	rt->dst.ops = &fake_dst_ops;
 }
 
 static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
 {
-	struct net_bridge_port *port = rcu_dereference(dev->br_port);
-
-	return port ? &port->br->fake_rtable : NULL;
+	if (!br_port_exists(dev))
+		return NULL;
+	return &br_port_get_rcu(dev)->br->fake_rtable;
 }
 
 static inline struct net_device *bridge_parent(const struct net_device *dev)
 {
-	struct net_bridge_port *port = rcu_dereference(dev->br_port);
+	if (!br_port_exists(dev))
+		return NULL;
 
-	return port ? port->br->dev : NULL;
+	return br_port_get_rcu(dev)->br->dev;
 }
 
 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
@@ -158,8 +163,8 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
 		if (tmp) {
 			memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
 			atomic_set(&tmp->use, 1);
-			nf_bridge_put(nf_bridge);
 		}
+		nf_bridge_put(nf_bridge);
 		nf_bridge = tmp;
 	}
 	return nf_bridge;
@@ -205,6 +210,72 @@ static inline void nf_bridge_update_protocol(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_PPP_SES);
 }
 
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+static int br_parse_ip_options(struct sk_buff *skb)
+{
+	struct ip_options *opt;
+	struct iphdr *iph;
+	struct net_device *dev = skb->dev;
+	u32 len;
+
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+
+	/* Basic sanity checks */
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	/* Zero out the CB buffer if no options present */
+	if (iph->ihl == 5) {
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		return 0;
+	}
+
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+	if (ip_options_compile(dev_net(dev), opt, skb))
+		goto inhdr_error;
+
+	/* Check correct handling of SRR option */
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+		if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
+			goto drop;
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+
+inhdr_error:
+	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* Fill in the header for fragmented IP packets handled by
  * the IPv4 connection tracking code.
  */
@@ -244,8 +315,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 		kfree_skb(skb);
 		return 0;
 	}
-	dst_hold(&rt->u.dst);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set_noref(skb, &rt->dst);
 
 	skb->dev = nf_bridge->physindev;
 	nf_bridge_update_protocol(skb);
@@ -396,8 +466,7 @@ bridged_dnat:
 			kfree_skb(skb);
 			return 0;
 		}
-		dst_hold(&rt->u.dst);
-		skb_dst_set(skb, &rt->u.dst);
+		skb_dst_set_noref(skb, &rt->dst);
 	}
 
 	skb->dev = nf_bridge->physindev;
@@ -545,25 +614,29 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	struct iphdr *iph;
+	struct net_bridge_port *p;
+	struct net_bridge *br;
 	__u32 len = nf_bridge_encap_header_len(skb);
 
 	if (unlikely(!pskb_may_pull(skb, len)))
 		goto out;
 
+	p = br_port_get_rcu(in);
+	if (p == NULL)
+		goto out;
+	br = p->br;
+
 	if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) ||
 	    IS_PPPOE_IPV6(skb)) {
-#ifdef CONFIG_SYSCTL
-		if (!brnf_call_ip6tables)
+		if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
 			return NF_ACCEPT;
-#endif
+
 		nf_bridge_pull_encap_header_rcsum(skb);
 		return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
 	}
-#ifdef CONFIG_SYSCTL
-	if (!brnf_call_iptables)
+
+	if (!brnf_call_iptables && !br->nf_call_iptables)
 		return NF_ACCEPT;
-#endif
 
 	if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) &&
 	    !IS_PPPOE_IP(skb))
@@ -571,28 +644,9 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 
 	nf_bridge_pull_encap_header_rcsum(skb);
 
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		goto inhdr_error;
-
-	iph = ip_hdr(skb);
-	if (iph->ihl < 5 || iph->version != 4)
-		goto inhdr_error;
-
-	if (!pskb_may_pull(skb, 4 * iph->ihl))
-		goto inhdr_error;
-
-	iph = ip_hdr(skb);
-	if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
-		goto inhdr_error;
-
-	len = ntohs(iph->tot_len);
-	if (skb->len < len || len < 4 * iph->ihl)
-		goto inhdr_error;
-
-	pskb_trim_rcsum(skb, len);
-
-	/* BUG: Should really parse the IP options here. */
-	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	if (br_parse_ip_options(skb))
+		/* Drop invalid packet */
+		goto out;
 
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
@@ -607,8 +661,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 
 	return NF_STOLEN;
 
-inhdr_error:
-//      IP_INC_STATS_BH(IpInHdrErrors);
 out:
 	return NF_DROP;
 }
@@ -719,12 +771,17 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
+	struct net_bridge_port *p;
+	struct net_bridge *br;
 	struct net_device **d = (struct net_device **)(skb->cb);
 
-#ifdef CONFIG_SYSCTL
-	if (!brnf_call_arptables)
+	p = br_port_get_rcu(out);
+	if (p == NULL)
+		return NF_ACCEPT;
+	br = p->br;
+
+	if (!brnf_call_arptables && !br->nf_call_arptables)
 		return NF_ACCEPT;
-#endif
 
 	if (skb->protocol != htons(ETH_P_ARP)) {
 		if (!IS_VLAN_ARP(skb))
@@ -747,12 +804,19 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 #if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
+	int ret;
+
 	if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
-	    !skb_is_gso(skb))
-		return ip_fragment(skb, br_dev_queue_push_xmit);
-	else
-		return br_dev_queue_push_xmit(skb);
+	    !skb_is_gso(skb)) {
+		if (br_parse_ip_options(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		ret = ip_fragment(skb, br_dev_queue_push_xmit);
+	} else
+		ret = br_dev_queue_push_xmit(skb);
+
+	return ret;
 }
 #else
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
@@ -940,15 +1004,22 @@ int __init br_netfilter_init(void)
 {
 	int ret;
 
-	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	ret = dst_entries_init(&fake_dst_ops);
 	if (ret < 0)
 		return ret;
+
+	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	if (ret < 0) {
+		dst_entries_destroy(&fake_dst_ops);
+		return ret;
+	}
 #ifdef CONFIG_SYSCTL
 	brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
 	if (brnf_sysctl_header == NULL) {
 		printk(KERN_WARNING
 		       "br_netfilter: can't register to sysctl.\n");
 		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		dst_entries_destroy(&fake_dst_ops);
 		return -ENOMEM;
 	}
 #endif
@@ -962,4 +1033,5 @@ void br_netfilter_fini(void)
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(brnf_sysctl_header);
 #endif
+	dst_entries_destroy(&fake_dst_ops);
 }
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fe0a79018ab..4a6a378c84e 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -120,10 +120,11 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	idx = 0;
 	for_each_netdev(net, dev) {
 		/* not a bridge port */
-		if (dev->br_port == NULL || idx < cb->args[0])
+		if (!br_port_exists(dev) || idx < cb->args[0])
 			goto skip;
 
-		if (br_fill_ifinfo(skb, dev->br_port, NETLINK_CB(cb->skb).pid,
+		if (br_fill_ifinfo(skb, br_port_get(dev),
+				   NETLINK_CB(cb->skb).pid,
 				   cb->nlh->nlmsg_seq, RTM_NEWLINK,
 				   NLM_F_MULTI) < 0)
 			break;
@@ -168,9 +169,9 @@ static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
 	if (!dev)
 		return -ENODEV;
 
-	p = dev->br_port;
-	if (!p)
+	if (!br_port_exists(dev))
 		return -EINVAL;
+	p = br_port_get(dev);
 
 	/* if kernel STP is running, don't allow changes */
 	if (p->br->stp_enabled == BR_KERNEL_STP)
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index 717e1fd6133..404d4e14c6a 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -32,14 +32,15 @@ struct notifier_block br_device_notifier = {
 static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct net_bridge_port *p = dev->br_port;
+	struct net_bridge_port *p = br_port_get(dev);
 	struct net_bridge *br;
 	int err;
 
 	/* not a port of a bridge */
-	if (p == NULL)
+	if (!br_port_exists(dev))
 		return NOTIFY_DONE;
 
+	p = br_port_get(dev);
 	br = p->br;
 
 	switch (event) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0f4a74bc6a9..75c90edaf7d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -15,6 +15,8 @@
 
 #include <linux/netdevice.h>
 #include <linux/if_bridge.h>
+#include <linux/netpoll.h>
+#include <linux/u64_stats_sync.h>
 #include <net/route.h>
 
 #define BR_HASH_BITS 8
@@ -143,13 +145,23 @@ struct net_bridge_port
 #ifdef CONFIG_SYSFS
 	char				sysfs_name[IFNAMSIZ];
 #endif
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct netpoll			*np;
+#endif
 };
 
+#define br_port_get_rcu(dev) \
+	((struct net_bridge_port *) rcu_dereference(dev->rx_handler_data))
+#define br_port_get(dev) ((struct net_bridge_port *) dev->rx_handler_data)
+#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
+
 struct br_cpu_netstats {
-	unsigned long	rx_packets;
-	unsigned long	rx_bytes;
-	unsigned long	tx_packets;
-	unsigned long	tx_bytes;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
 };
 
 struct net_bridge
@@ -164,6 +176,9 @@ struct net_bridge
 	unsigned long			feature_mask;
 #ifdef CONFIG_BRIDGE_NETFILTER
 	struct rtable 			fake_rtable;
+	bool				nf_call_iptables;
+	bool				nf_call_ip6tables;
+	bool				nf_call_arptables;
 #endif
 	unsigned long			flags;
 #define BR_SET_MAC_ADDR		0x00000001
@@ -273,16 +288,41 @@ extern void br_dev_setup(struct net_device *dev);
 extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
 			       struct net_device *dev);
 #ifdef CONFIG_NET_POLL_CONTROLLER
-extern void br_netpoll_cleanup(struct net_device *dev);
-extern void br_netpoll_enable(struct net_bridge *br,
-			      struct net_device *dev);
-extern void br_netpoll_disable(struct net_bridge *br,
-			       struct net_device *dev);
+static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
+{
+	return br->dev->npinfo;
+}
+
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+				       struct sk_buff *skb)
+{
+	struct netpoll *np = p->np;
+
+	if (np)
+		netpoll_send_skb(np, skb);
+}
+
+extern int br_netpoll_enable(struct net_bridge_port *p);
+extern void br_netpoll_disable(struct net_bridge_port *p);
 #else
-#define br_netpoll_cleanup(br)
-#define br_netpoll_enable(br, dev)
-#define br_netpoll_disable(br, dev)
+static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
+{
+	return NULL;
+}
+
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+				       struct sk_buff *skb)
+{
+}
 
+static inline int br_netpoll_enable(struct net_bridge_port *p)
+{
+	return 0;
+}
+
+static inline void br_netpoll_disable(struct net_bridge_port *p)
+{
+}
 #endif
 
 /* br_fdb.c */
@@ -331,8 +371,7 @@ extern void br_features_recompute(struct net_bridge *br);
 
 /* br_input.c */
 extern int br_handle_frame_finish(struct sk_buff *skb);
-extern struct sk_buff *br_handle_frame(struct net_bridge_port *p,
-				       struct sk_buff *skb);
+extern struct sk_buff *br_handle_frame(struct sk_buff *skb);
 
 /* br_ioctl.c */
 extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 217bd225a42..35cf27087b5 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -131,18 +131,19 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
 /*
  * Called from llc.
  *
- * NO locks, but rcu_read_lock (preempt_disabled)
+ * NO locks, but rcu_read_lock
  */
 void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
 		struct net_device *dev)
 {
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
-	struct net_bridge_port *p = rcu_dereference(dev->br_port);
+	struct net_bridge_port *p;
 	struct net_bridge *br;
 	const unsigned char *buf;
 
-	if (!p)
+	if (!br_port_exists(dev))
 		goto err;
+	p = br_port_get_rcu(dev);
 
 	if (!pskb_may_pull(skb, 4))
 		goto err;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 486b8f3861d..5c1e5559ebb 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -611,6 +611,73 @@ static DEVICE_ATTR(multicast_startup_query_interval, S_IRUGO | S_IWUSR,
 		   show_multicast_startup_query_interval,
 		   store_multicast_startup_query_interval);
 #endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+static ssize_t show_nf_call_iptables(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%u\n", br->nf_call_iptables);
+}
+
+static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
+{
+	br->nf_call_iptables = val ? true : false;
+	return 0;
+}
+
+static ssize_t store_nf_call_iptables(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_nf_call_iptables);
+}
+static DEVICE_ATTR(nf_call_iptables, S_IRUGO | S_IWUSR,
+		   show_nf_call_iptables, store_nf_call_iptables);
+
+static ssize_t show_nf_call_ip6tables(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%u\n", br->nf_call_ip6tables);
+}
+
+static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
+{
+	br->nf_call_ip6tables = val ? true : false;
+	return 0;
+}
+
+static ssize_t store_nf_call_ip6tables(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_nf_call_ip6tables);
+}
+static DEVICE_ATTR(nf_call_ip6tables, S_IRUGO | S_IWUSR,
+		   show_nf_call_ip6tables, store_nf_call_ip6tables);
+
+static ssize_t show_nf_call_arptables(
+	struct device *d, struct device_attribute *attr, char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	return sprintf(buf, "%u\n", br->nf_call_arptables);
+}
+
+static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
+{
+	br->nf_call_arptables = val ? true : false;
+	return 0;
+}
+
+static ssize_t store_nf_call_arptables(
+	struct device *d, struct device_attribute *attr, const char *buf,
+	size_t len)
+{
+	return store_bridge_parm(d, buf, len, set_nf_call_arptables);
+}
+static DEVICE_ATTR(nf_call_arptables, S_IRUGO | S_IWUSR,
+		   show_nf_call_arptables, store_nf_call_arptables);
+#endif
 
 static struct attribute *bridge_attrs[] = {
 	&dev_attr_forward_delay.attr,
@@ -645,6 +712,11 @@ static struct attribute *bridge_attrs[] = {
 	&dev_attr_multicast_query_response_interval.attr,
 	&dev_attr_multicast_startup_query_interval.attr,
 #endif
+#ifdef CONFIG_BRIDGE_NETFILTER
+	&dev_attr_nf_call_iptables.attr,
+	&dev_attr_nf_call_ip6tables.attr,
+	&dev_attr_nf_call_arptables.attr,
+#endif
 	NULL
 };
 
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 9e19166ba45..46624bb6d9b 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -24,8 +24,9 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		return EBT_DROP;
 
 	if (par->hooknum != NF_BR_BROUTING)
+		/* rcu_read_lock()ed by nf_hook_slow */
 		memcpy(eth_hdr(skb)->h_dest,
-		       par->in->br_port->br->dev->dev_addr, ETH_ALEN);
+		       br_port_get_rcu(par->in)->br->dev->dev_addr, ETH_ALEN);
 	else
 		memcpy(eth_hdr(skb)->h_dest, par->in->dev_addr, ETH_ALEN);
 	skb->pkt_type = PACKET_HOST;
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index ae3c7cef148..26377e96fa1 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -177,8 +177,9 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	if (in) {
 		strcpy(pm->physindev, in->name);
 		/* If in isn't a bridge, then physindev==indev */
-		if (in->br_port)
-			strcpy(pm->indev, in->br_port->br->dev->name);
+		if (br_port_exists(in))
+			/* rcu_read_lock()ed by nf_hook_slow */
+			strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name);
 		else
 			strcpy(pm->indev, in->name);
 	} else
@@ -187,7 +188,8 @@ static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb,
 	if (out) {
 		/* If out exists, then out is a bridge port */
 		strcpy(pm->physoutdev, out->name);
-		strcpy(pm->outdev, out->br_port->br->dev->name);
+		/* rcu_read_lock()ed by nf_hook_slow */
+		strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name);
 	} else
 		pm->outdev[0] = pm->physoutdev[0] = '\0';
 
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 87b53b3a921..eae67bf0446 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -39,8 +39,6 @@ static bool
 ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct ebt_vlan_info *info = par->matchinfo;
-	const struct vlan_hdr *fp;
-	struct vlan_hdr _frame;
 
 	unsigned short TCI;	/* Whole TCI, given from parsed frame */
 	unsigned short id;	/* VLAN ID, given from frame TCI */
@@ -48,9 +46,20 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	/* VLAN encapsulated Type/Length field, given from orig frame */
 	__be16 encap;
 
-	fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
-	if (fp == NULL)
-		return false;
+	if (vlan_tx_tag_present(skb)) {
+		TCI = vlan_tx_tag_get(skb);
+		encap = skb->protocol;
+	} else {
+		const struct vlan_hdr *fp;
+		struct vlan_hdr _frame;
+
+		fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame);
+		if (fp == NULL)
+			return false;
+
+		TCI = ntohs(fp->h_vlan_TCI);
+		encap = fp->h_vlan_encapsulated_proto;
+	}
 
 	/* Tag Control Information (TCI) consists of the following elements:
 	 * - User_priority. The user_priority field is three bits in length,
@@ -59,10 +68,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	 * (CFI) is a single bit flag value. Currently ignored.
 	 * - VLAN Identifier (VID). The VID is encoded as
 	 * an unsigned binary number. */
-	TCI = ntohs(fp->h_vlan_TCI);
 	id = TCI & VLAN_VID_MASK;
 	prio = (TCI >> 13) & 0x7;
-	encap = fp->h_vlan_encapsulated_proto;
 
 	/* Checking VLAN Identifier (VID) */
 	if (GET_BITMASK(EBT_VLAN_ID))
@@ -111,10 +118,10 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
 	 * 0 - The null VLAN ID.
 	 * 1 - The default Port VID (PVID)
 	 * 0x0FFF - Reserved for implementation use.
-	 * if_vlan.h: VLAN_GROUP_ARRAY_LEN 4096. */
+	 * if_vlan.h: VLAN_N_VID 4096. */
 	if (GET_BITMASK(EBT_VLAN_ID)) {
 		if (!!info->id) { /* if id!=0 => check vid range */
-			if (info->id > VLAN_GROUP_ARRAY_LEN) {
+			if (info->id > VLAN_N_VID) {
 				pr_debug("id %d is out of range (1-4096)\n",
 					 info->id);
 				return -EINVAL;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 59ca00e40de..a1dcf83f0d5 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -124,27 +124,37 @@ ebt_dev_check(const char *entry, const struct net_device *device)
 #define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg))
 /* process standard matches */
 static inline int
-ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h,
+ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
                 const struct net_device *in, const struct net_device *out)
 {
+	const struct ethhdr *h = eth_hdr(skb);
+	__be16 ethproto;
 	int verdict, i;
 
+	if (vlan_tx_tag_present(skb))
+		ethproto = htons(ETH_P_8021Q);
+	else
+		ethproto = h->h_proto;
+
 	if (e->bitmask & EBT_802_3) {
-		if (FWINV2(ntohs(h->h_proto) >= 1536, EBT_IPROTO))
+		if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO))
 			return 1;
 	} else if (!(e->bitmask & EBT_NOPROTO) &&
-	   FWINV2(e->ethproto != h->h_proto, EBT_IPROTO))
+	   FWINV2(e->ethproto != ethproto, EBT_IPROTO))
 		return 1;
 
 	if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN))
 		return 1;
 	if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
 		return 1;
-	if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN))
+	/* rcu_read_lock()ed by nf_hook_slow */
+	if (in && br_port_exists(in) &&
+	    FWINV2(ebt_dev_check(e->logical_in, br_port_get_rcu(in)->br->dev),
+		   EBT_ILOGICALIN))
 		return 1;
-	if ((!out || !out->br_port) ? 0 : FWINV2(ebt_dev_check(
-	   e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT))
+	if (out && br_port_exists(out) &&
+	    FWINV2(ebt_dev_check(e->logical_out, br_port_get_rcu(out)->br->dev),
+		   EBT_ILOGICALOUT))
 		return 1;
 
 	if (e->bitmask & EBT_SOURCEMAC) {
@@ -210,7 +220,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb,
 	base = private->entries;
 	i = 0;
 	while (i < nentries) {
-		if (ebt_basic_match(point, eth_hdr(skb), in, out))
+		if (ebt_basic_match(point, skb, in, out))
 			goto letscontinue;
 
 		if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0)
diff --git a/net/caif/Kconfig b/net/caif/Kconfig
index ed651786f16..529750da962 100644
--- a/net/caif/Kconfig
+++ b/net/caif/Kconfig
@@ -21,19 +21,18 @@ menuconfig CAIF
 	See Documentation/networking/caif for a further explanation on how to
 	use and configure CAIF.
 
-if CAIF
-
 config  CAIF_DEBUG
 	bool "Enable Debug"
+	depends on CAIF
 	default n
 	--- help ---
 	Enable the inclusion of debug code in the CAIF stack.
 	Be aware that doing this will impact performance.
 	If unsure say N.
 
-
 config CAIF_NETDEV
 	tristate "CAIF GPRS Network device"
+	depends on CAIF
 	default CAIF
 	---help---
 	Say Y if you will be using a CAIF based GPRS network device.
@@ -41,5 +40,3 @@ config CAIF_NETDEV
 	If you select to build it as a built-in then the main CAIF device must
 	also be a built-in.
 	If unsure say Y.
-
-endif
diff --git a/net/caif/Makefile b/net/caif/Makefile
index 34852af2595..f87481fb0e6 100644
--- a/net/caif/Makefile
+++ b/net/caif/Makefile
@@ -1,23 +1,13 @@
-ifeq ($(CONFIG_CAIF_DEBUG),1)
-CAIF_DBG_FLAGS := -DDEBUG
+ifeq ($(CONFIG_CAIF_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
 endif
 
-ccflags-y := $(CAIF_FLAGS) $(CAIF_DBG_FLAGS)
-
 caif-objs := caif_dev.o \
 	cfcnfg.o cfmuxl.o cfctrl.o  \
 	cffrml.o cfveil.o cfdbgl.o\
 	cfserl.o cfdgml.o  \
 	cfrfml.o cfvidl.o cfutill.o \
 	cfsrvl.o cfpkt_skbuff.o caif_config_util.o
-clean-dirs:= .tmp_versions
-
-clean-files:= \
-	Module.symvers \
-	modules.order \
-	*.cmd \
-	*.o \
-	*~
 
 obj-$(CONFIG_CAIF) += caif.o
 obj-$(CONFIG_CAIF_NETDEV) += chnl_net.o
diff --git a/net/caif/caif_config_util.c b/net/caif/caif_config_util.c
index 6f36580366f..76ae68303d3 100644
--- a/net/caif/caif_config_util.c
+++ b/net/caif/caif_config_util.c
@@ -80,6 +80,11 @@ int connect_req_to_link_param(struct cfcnfg *cnfg,
 		       l->u.utility.paramlen);
 
 		break;
+	case CAIFPROTO_DEBUG:
+		l->linktype = CFCTRL_SRV_DBG;
+		l->endpoint = s->sockaddr.u.dbg.service;
+		l->chtype = s->sockaddr.u.dbg.type;
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index e2b86f1f5a4..b99369a055d 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -9,6 +9,8 @@
  *  and Sakari Ailus <sakari.ailus@nokia.com>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -171,7 +173,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev,
 	net = dev_net(dev);
 	pkt = cfpkt_fromnative(CAIF_DIR_IN, skb);
 	caifd = caif_get(dev);
-	if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd)
+	if (!caifd || !caifd->layer.up || !caifd->layer.up->receive)
 		return NET_RX_DROP;
 
 	if (caifd->layer.up->receive(caifd->layer.up, pkt))
@@ -214,7 +216,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 
 	switch (what) {
 	case NETDEV_REGISTER:
-		pr_info("CAIF: %s():register %s\n", __func__, dev->name);
+		netdev_info(dev, "register\n");
 		caifd = caif_device_alloc(dev);
 		if (caifd == NULL)
 			break;
@@ -225,14 +227,13 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		break;
 
 	case NETDEV_UP:
-		pr_info("CAIF: %s(): up %s\n", __func__, dev->name);
+		netdev_info(dev, "up\n");
 		caifd = caif_get(dev);
 		if (caifd == NULL)
 			break;
 		caifdev = netdev_priv(dev);
 		if (atomic_read(&caifd->state) == NETDEV_UP) {
-			pr_info("CAIF: %s():%s already up\n",
-				__func__, dev->name);
+			netdev_info(dev, "already up\n");
 			break;
 		}
 		atomic_set(&caifd->state, what);
@@ -255,7 +256,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 			pref = CFPHYPREF_HIGH_BW;
 			break;
 		}
-
+		dev_hold(dev);
 		cfcnfg_add_phy_layer(get_caif_conf(),
 				     phy_type,
 				     dev,
@@ -273,7 +274,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		caifd = caif_get(dev);
 		if (caifd == NULL)
 			break;
-		pr_info("CAIF: %s():going down %s\n", __func__, dev->name);
+		netdev_info(dev, "going down\n");
 
 		if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN ||
 			atomic_read(&caifd->state) == NETDEV_DOWN)
@@ -285,6 +286,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		caifd->layer.up->ctrlcmd(caifd->layer.up,
 					 _CAIF_CTRLCMD_PHYIF_DOWN_IND,
 					 caifd->layer.id);
+		might_sleep();
 		res = wait_event_interruptible_timeout(caifd->event,
 					atomic_read(&caifd->in_use) == 0,
 					TIMEOUT);
@@ -294,18 +296,18 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		caifd = caif_get(dev);
 		if (caifd == NULL)
 			break;
-		pr_info("CAIF: %s(): down %s\n", __func__, dev->name);
+		netdev_info(dev, "down\n");
 		if (atomic_read(&caifd->in_use))
-			pr_warning("CAIF: %s(): "
-				   "Unregistering an active CAIF device: %s\n",
-				   __func__, dev->name);
+			netdev_warn(dev,
+				    "Unregistering an active CAIF device\n");
 		cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer);
+		dev_put(dev);
 		atomic_set(&caifd->state, what);
 		break;
 
 	case NETDEV_UNREGISTER:
 		caifd = caif_get(dev);
-		pr_info("CAIF: %s(): unregister %s\n", __func__, dev->name);
+		netdev_info(dev, "unregister\n");
 		atomic_set(&caifd->state, what);
 		caif_device_destroy(dev);
 		break;
@@ -326,7 +328,8 @@ struct cfcnfg *get_caif_conf(void)
 EXPORT_SYMBOL(get_caif_conf);
 
 int caif_connect_client(struct caif_connect_request *conn_req,
-			   struct cflayer *client_layer)
+			struct cflayer *client_layer, int *ifindex,
+			int *headroom, int *tailroom)
 {
 	struct cfctrl_link_param param;
 	int ret;
@@ -334,8 +337,9 @@ int caif_connect_client(struct caif_connect_request *conn_req,
 	if (ret)
 		return ret;
 	/* Hook up the adaptation layer. */
-	return cfcnfg_add_adaptation_layer(get_caif_conf(),
-						&param, client_layer);
+	return cfcnfg_add_adaptation_layer(get_caif_conf(), &param,
+					client_layer, ifindex,
+					headroom, tailroom);
 }
 EXPORT_SYMBOL(caif_connect_client);
 
@@ -387,7 +391,7 @@ static int __init caif_device_init(void)
 	int result;
 	cfg = cfcnfg_create();
 	if (!cfg) {
-		pr_warning("CAIF: %s(): can't create cfcnfg.\n", __func__);
+		pr_warn("can't create cfcnfg\n");
 		goto err_cfcnfg_create_failed;
 	}
 	result = register_pernet_device(&caif_net_ops);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 3d0e09584fa..2eca2dd0000 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -15,7 +17,6 @@
 #include <linux/poll.h>
 #include <linux/tcp.h>
 #include <linux/uaccess.h>
-#include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/caif/caif_socket.h>
 #include <asm/atomic.h>
@@ -28,9 +29,6 @@
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_NETPROTO(AF_CAIF);
 
-#define CAIF_DEF_SNDBUF (CAIF_MAX_PAYLOAD_SIZE*10)
-#define CAIF_DEF_RCVBUF (CAIF_MAX_PAYLOAD_SIZE*100)
-
 /*
  * CAIF state is re-using the TCP socket states.
  * caif_states stored in sk_state reflect the state as reported by
@@ -76,6 +74,7 @@ struct caifsock {
 	struct caif_connect_request conn_req;
 	struct mutex readlock;
 	struct dentry *debugfs_socket_dir;
+	int headroom, tailroom, maxframe;
 };
 
 static int rx_flow_is_on(struct caifsock *cf_sk)
@@ -156,9 +155,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 		(unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
-		trace_printk("CAIF: %s():"
-			" sending flow OFF (queue len = %d %d)\n",
-			__func__,
+		pr_debug("sending flow OFF (queue len = %d %d)\n",
 			atomic_read(&cf_sk->sk.sk_rmem_alloc),
 			sk_rcvbuf_lowwater(cf_sk));
 		set_rx_flow_off(cf_sk);
@@ -171,9 +168,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return err;
 	if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
 		set_rx_flow_off(cf_sk);
-		trace_printk("CAIF: %s():"
-			" sending flow OFF due to rmem_schedule\n",
-			__func__);
+		pr_debug("sending flow OFF due to rmem_schedule\n");
 		dbfs_atomic_inc(&cnt.num_rx_flow_off);
 		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
 	}
@@ -274,8 +269,7 @@ static void caif_ctrl_cb(struct cflayer *layr,
 		break;
 
 	default:
-		pr_debug("CAIF: %s(): Unexpected flow command %d\n",
-				__func__, flow);
+		pr_debug("Unexpected flow command %d\n", flow);
 	}
 }
 
@@ -535,8 +529,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
 
 		/* Slight paranoia, probably not needed. */
 		if (unlikely(loopcnt++ > 1000)) {
-			pr_warning("CAIF: %s(): transmit retries failed,"
-				" error = %d\n", __func__, ret);
+			pr_warn("transmit retries failed, error = %d\n", ret);
 			break;
 		}
 
@@ -594,27 +587,32 @@ static int caif_seqpkt_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto err;
 	noblock = msg->msg_flags & MSG_DONTWAIT;
 
-	buffer_size = len + CAIF_NEEDED_HEADROOM + CAIF_NEEDED_TAILROOM;
-
-	ret = -EMSGSIZE;
-	if (buffer_size > CAIF_MAX_PAYLOAD_SIZE)
-		goto err;
-
 	timeo = sock_sndtimeo(sk, noblock);
 	timeo = caif_wait_for_flow_on(container_of(sk, struct caifsock, sk),
 				1, timeo, &ret);
 
+	if (ret)
+		goto err;
 	ret = -EPIPE;
 	if (cf_sk->sk.sk_state != CAIF_CONNECTED ||
 		sock_flag(sk, SOCK_DEAD) ||
 		(sk->sk_shutdown & RCV_SHUTDOWN))
 		goto err;
 
+	/* Error if trying to write more than maximum frame size. */
+	ret = -EMSGSIZE;
+	if (len > cf_sk->maxframe && cf_sk->sk.sk_protocol != CAIFPROTO_RFM)
+		goto err;
+
+	buffer_size = len + cf_sk->headroom + cf_sk->tailroom;
+
 	ret = -ENOMEM;
 	skb = sock_alloc_send_skb(sk, buffer_size, noblock, &ret);
-	if (!skb)
+
+	if (!skb || skb_tailroom(skb) < buffer_size)
 		goto err;
-	skb_reserve(skb, CAIF_NEEDED_HEADROOM);
+
+	skb_reserve(skb, cf_sk->headroom);
 
 	ret = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 
@@ -645,7 +643,6 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	long timeo;
 
 	err = -EOPNOTSUPP;
-
 	if (unlikely(msg->msg_flags&MSG_OOB))
 		goto out_err;
 
@@ -662,8 +659,8 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 
 		size = len-sent;
 
-		if (size > CAIF_MAX_PAYLOAD_SIZE)
-			size = CAIF_MAX_PAYLOAD_SIZE;
+		if (size > cf_sk->maxframe)
+			size = cf_sk->maxframe;
 
 		/* If size is more than half of sndbuf, chop up message */
 		if (size > ((sk->sk_sndbuf >> 1) - 64))
@@ -673,14 +670,14 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			size = SKB_MAX_ALLOC;
 
 		skb = sock_alloc_send_skb(sk,
-					size + CAIF_NEEDED_HEADROOM
-					+ CAIF_NEEDED_TAILROOM,
+					size + cf_sk->headroom +
+					cf_sk->tailroom,
 					msg->msg_flags&MSG_DONTWAIT,
 					&err);
 		if (skb == NULL)
 			goto out_err;
 
-		skb_reserve(skb, CAIF_NEEDED_HEADROOM);
+		skb_reserve(skb, cf_sk->headroom);
 		/*
 		 *	If you pass two values to the sock_alloc_send_skb
 		 *	it tries to grab the large buffer with GFP_NOFS
@@ -821,17 +818,16 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
 	long timeo;
 	int err;
+	int ifindex, headroom, tailroom;
+	unsigned int mtu;
+	struct net_device *dev;
+
 	lock_sock(sk);
 
 	err = -EAFNOSUPPORT;
 	if (uaddr->sa_family != AF_CAIF)
 		goto out;
 
-	err = -ESOCKTNOSUPPORT;
-	if (unlikely(!(sk->sk_type == SOCK_STREAM &&
-		       cf_sk->sk.sk_protocol == CAIFPROTO_AT) &&
-		       sk->sk_type != SOCK_SEQPACKET))
-		goto out;
 	switch (sock->state) {
 	case SS_UNCONNECTED:
 		/* Normal case, a fresh connect */
@@ -874,8 +870,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
 	sk_stream_kill_queues(&cf_sk->sk);
 
 	err = -EINVAL;
-	if (addr_len != sizeof(struct sockaddr_caif) ||
-		!uaddr)
+	if (addr_len != sizeof(struct sockaddr_caif))
 		goto out;
 
 	memcpy(&cf_sk->conn_req.sockaddr, uaddr,
@@ -888,13 +883,31 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
 	dbfs_atomic_inc(&cnt.num_connect_req);
 	cf_sk->layer.receive = caif_sktrecv_cb;
 	err = caif_connect_client(&cf_sk->conn_req,
-				&cf_sk->layer);
+				&cf_sk->layer, &ifindex, &headroom, &tailroom);
 	if (err < 0) {
 		cf_sk->sk.sk_socket->state = SS_UNCONNECTED;
 		cf_sk->sk.sk_state = CAIF_DISCONNECTED;
 		goto out;
 	}
 
+	err = -ENODEV;
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+	if (!dev) {
+		rcu_read_unlock();
+		goto out;
+	}
+	cf_sk->headroom = LL_RESERVED_SPACE_EXTRA(dev, headroom);
+	mtu = dev->mtu;
+	rcu_read_unlock();
+
+	cf_sk->tailroom = tailroom;
+	cf_sk->maxframe = mtu - (headroom + tailroom);
+	if (cf_sk->maxframe < 1) {
+		pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu);
+		goto out;
+	}
+
 	err = -EINPROGRESS;
 wait_connect:
 
@@ -1110,10 +1123,6 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
 	/* Store the protocol */
 	sk->sk_protocol = (unsigned char) protocol;
 
-	/* Sendbuf dictates the amount of outbound packets not yet sent */
-	sk->sk_sndbuf = CAIF_DEF_SNDBUF;
-	sk->sk_rcvbuf = CAIF_DEF_RCVBUF;
-
 	/*
 	 * Lock in order to try to stop someone from opening the socket
 	 * too early.
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index df43f264d9f..41adafd1891 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -3,9 +3,13 @@
  * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
  * License terms: GNU General Public License (GPL) version 2
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/slab.h>
+#include <linux/netdevice.h>
 #include <net/caif/caif_layer.h>
 #include <net/caif/cfpkt.h>
 #include <net/caif/cfcnfg.h>
@@ -22,6 +26,7 @@
 #define PHY_NAME_LEN 20
 
 #define container_obj(layr) container_of(layr, struct cfcnfg, layer)
+#define RFM_FRAGMENT_SIZE 4030
 
 /* Information about CAIF physical interfaces held by Config Module in order
  * to manage physical interfaces
@@ -41,6 +46,15 @@ struct cfcnfg_phyinfo {
 
 	/* Information about the physical device */
 	struct dev_info dev_info;
+
+	/* Interface index */
+	int ifindex;
+
+	/* Use Start of frame extension */
+	bool use_stx;
+
+	/* Use Start of frame checksum */
+	bool use_fcs;
 };
 
 struct cfcnfg {
@@ -67,7 +81,7 @@ struct cfcnfg *cfcnfg_create(void)
 	/* Initiate this layer */
 	this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	this->mux = cfmuxl_create();
@@ -95,7 +109,7 @@ struct cfcnfg *cfcnfg_create(void)
 	layer_set_up(this->ctrl, this);
 	return this;
 out_of_mem:
-	pr_warning("CAIF: %s(): Out of memory\n", __func__);
+	pr_warn("Out of memory\n");
 	kfree(this->mux);
 	kfree(this->ctrl);
 	kfree(this);
@@ -183,7 +197,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
 	caif_assert(adap_layer != NULL);
 	channel_id = adap_layer->id;
 	if (adap_layer->dn == NULL || channel_id == 0) {
-		pr_err("CAIF: %s():adap_layer->id is 0\n", __func__);
+		pr_err("adap_layer->dn == NULL or adap_layer->id is 0\n");
 		ret = -ENOTCONN;
 		goto end;
 	}
@@ -193,9 +207,8 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
 	layer_set_up(servl, NULL);
 	ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
 	if (servl == NULL) {
-		pr_err("CAIF: %s(): PROTOCOL ERROR "
-		       "- Error removing service_layer Channel_Id(%d)",
-			__func__, channel_id);
+		pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)",
+		       channel_id);
 		ret = -EINVAL;
 		goto end;
 	}
@@ -205,18 +218,14 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
 
 		phyinfo = cfcnfg_get_phyinfo(cnfg, phyid);
 		if (phyinfo == NULL) {
-			pr_warning("CAIF: %s(): "
-				"No interface to send disconnect to\n",
-				__func__);
+			pr_warn("No interface to send disconnect to\n");
 			ret = -ENODEV;
 			goto end;
 		}
 		if (phyinfo->id != phyid ||
 			phyinfo->phy_layer->id != phyid ||
 			phyinfo->frm_layer->id != phyid) {
-			pr_err("CAIF: %s(): "
-				"Inconsistency in phy registration\n",
-				__func__);
+			pr_err("Inconsistency in phy registration\n");
 			ret = -EINVAL;
 			goto end;
 		}
@@ -248,27 +257,37 @@ static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id)
 {
 }
 
+int protohead[CFCTRL_SRV_MASK] = {
+	[CFCTRL_SRV_VEI] = 4,
+	[CFCTRL_SRV_DATAGRAM] = 7,
+	[CFCTRL_SRV_UTIL] = 4,
+	[CFCTRL_SRV_RFM] = 3,
+	[CFCTRL_SRV_DBG] = 3,
+};
+
 int cfcnfg_add_adaptation_layer(struct cfcnfg *cnfg,
 				struct cfctrl_link_param *param,
-				struct cflayer *adap_layer)
+				struct cflayer *adap_layer,
+				int *ifindex,
+				int *proto_head,
+				int *proto_tail)
 {
 	struct cflayer *frml;
 	if (adap_layer == NULL) {
-		pr_err("CAIF: %s(): adap_layer is zero", __func__);
+		pr_err("adap_layer is zero\n");
 		return -EINVAL;
 	}
 	if (adap_layer->receive == NULL) {
-		pr_err("CAIF: %s(): adap_layer->receive is NULL", __func__);
+		pr_err("adap_layer->receive is NULL\n");
 		return -EINVAL;
 	}
 	if (adap_layer->ctrlcmd == NULL) {
-		pr_err("CAIF: %s(): adap_layer->ctrlcmd == NULL", __func__);
+		pr_err("adap_layer->ctrlcmd == NULL\n");
 		return -EINVAL;
 	}
 	frml = cnfg->phy_layers[param->phyid].frm_layer;
 	if (frml == NULL) {
-		pr_err("CAIF: %s(): Specified PHY type does not exist!",
-			__func__);
+		pr_err("Specified PHY type does not exist!\n");
 		return -ENODEV;
 	}
 	caif_assert(param->phyid == cnfg->phy_layers[param->phyid].id);
@@ -276,6 +295,14 @@ int cfcnfg_add_adaptation_layer(struct cfcnfg *cnfg,
 		     param->phyid);
 	caif_assert(cnfg->phy_layers[param->phyid].phy_layer->id ==
 		     param->phyid);
+
+	*ifindex = cnfg->phy_layers[param->phyid].ifindex;
+	*proto_head =
+		protohead[param->linktype]+
+		(cnfg->phy_layers[param->phyid].use_stx ? 1 : 0);
+
+	*proto_tail = 2;
+
 	/* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */
 	cfctrl_enum_req(cnfg->ctrl, param->phyid);
 	return cfctrl_linkup_request(cnfg->ctrl, param, adap_layer);
@@ -297,10 +324,10 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 	struct cfcnfg *cnfg = container_obj(layer);
 	struct cflayer *servicel = NULL;
 	struct cfcnfg_phyinfo *phyinfo;
+	struct net_device *netdev;
+
 	if (adapt_layer == NULL) {
-		pr_debug("CAIF: %s(): link setup response "
-				"but no client exist, send linkdown back\n",
-				__func__);
+		pr_debug("link setup response but no client exist, send linkdown back\n");
 		cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
 		return;
 	}
@@ -308,19 +335,15 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 	caif_assert(cnfg != NULL);
 	caif_assert(phyid != 0);
 	phyinfo = &cnfg->phy_layers[phyid];
-	caif_assert(phyinfo != NULL);
 	caif_assert(phyinfo->id == phyid);
 	caif_assert(phyinfo->phy_layer != NULL);
 	caif_assert(phyinfo->phy_layer->id == phyid);
 
-	if (phyinfo != NULL &&
-	    phyinfo->phy_ref_count++ == 0 &&
-	    phyinfo->phy_layer != NULL &&
+	phyinfo->phy_ref_count++;
+	if (phyinfo->phy_ref_count == 1 &&
 	    phyinfo->phy_layer->modemcmd != NULL) {
-		caif_assert(phyinfo->phy_layer->id == phyid);
 		phyinfo->phy_layer->modemcmd(phyinfo->phy_layer,
 					     _CAIF_MODEMCMD_PHYIF_USEFULL);
-
 	}
 	adapt_layer->id = channel_id;
 
@@ -332,7 +355,9 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 		servicel = cfdgml_create(channel_id, &phyinfo->dev_info);
 		break;
 	case CFCTRL_SRV_RFM:
-		servicel = cfrfml_create(channel_id, &phyinfo->dev_info);
+		netdev = phyinfo->dev_info.dev;
+		servicel = cfrfml_create(channel_id, &phyinfo->dev_info,
+						netdev->mtu);
 		break;
 	case CFCTRL_SRV_UTIL:
 		servicel = cfutill_create(channel_id, &phyinfo->dev_info);
@@ -344,13 +369,11 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 		servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
 		break;
 	default:
-		pr_err("CAIF: %s(): Protocol error. "
-			"Link setup response - unknown channel type\n",
-			__func__);
+		pr_err("Protocol error. Link setup response - unknown channel type\n");
 		return;
 	}
 	if (!servicel) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	layer_set_dn(servicel, cnfg->mux);
@@ -363,8 +386,8 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 
 void
 cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
-		     void *dev, struct cflayer *phy_layer, u16 *phyid,
-		     enum cfcnfg_phy_preference pref,
+		     struct net_device *dev, struct cflayer *phy_layer,
+		     u16 *phyid, enum cfcnfg_phy_preference pref,
 		     bool fcs, bool stx)
 {
 	struct cflayer *frml;
@@ -388,7 +411,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 		}
 	}
 	if (*phyid == 0) {
-		pr_err("CAIF: %s(): No Available PHY ID\n", __func__);
+		pr_err("No Available PHY ID\n");
 		return;
 	}
 
@@ -397,7 +420,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 		phy_driver =
 		    cfserl_create(CFPHYTYPE_FRAG, *phyid, stx);
 		if (!phy_driver) {
-			pr_warning("CAIF: %s(): Out of memory\n", __func__);
+			pr_warn("Out of memory\n");
 			return;
 		}
 
@@ -406,7 +429,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 		phy_driver = NULL;
 		break;
 	default:
-		pr_err("CAIF: %s(): %d", __func__, phy_type);
+		pr_err("%d\n", phy_type);
 		return;
 		break;
 	}
@@ -418,10 +441,14 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 	cnfg->phy_layers[*phyid].dev_info.dev = dev;
 	cnfg->phy_layers[*phyid].phy_layer = phy_layer;
 	cnfg->phy_layers[*phyid].phy_ref_count = 0;
+	cnfg->phy_layers[*phyid].ifindex = dev->ifindex;
+	cnfg->phy_layers[*phyid].use_stx = stx;
+	cnfg->phy_layers[*phyid].use_fcs = fcs;
+
 	phy_layer->type = phy_type;
 	frml = cffrml_create(*phyid, fcs);
 	if (!frml) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cnfg->phy_layers[*phyid].frm_layer = frml;
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index fcfda98a5e6..08f267a109a 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -19,7 +21,7 @@
 #ifdef CAIF_NO_LOOP
 static int handle_loop(struct cfctrl *ctrl,
 			      int cmd, struct cfpkt *pkt){
-	return CAIF_FAILURE;
+	return -1;
 }
 #else
 static int handle_loop(struct cfctrl *ctrl,
@@ -36,14 +38,14 @@ struct cflayer *cfctrl_create(void)
 	struct cfctrl *this =
 		kmalloc(sizeof(struct cfctrl), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
 	memset(&dev_info, 0, sizeof(dev_info));
 	dev_info.id = 0xff;
 	memset(this, 0, sizeof(*this));
-	cfsrvl_init(&this->serv, 0, &dev_info);
+	cfsrvl_init(&this->serv, 0, &dev_info, false);
 	atomic_set(&this->req_seq_no, 1);
 	atomic_set(&this->rsp_seq_no, 1);
 	this->serv.layer.receive = cfctrl_recv;
@@ -132,9 +134,7 @@ struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
 	list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
 		if (cfctrl_req_eq(req, p)) {
 			if (p != first)
-				pr_warning("CAIF: %s(): Requests are not "
-					"received in order\n",
-					__func__);
+				pr_warn("Requests are not received in order\n");
 
 			atomic_set(&ctrl->rsp_seq_no,
 					 p->sequence_no);
@@ -177,7 +177,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
 	int ret;
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -189,8 +189,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
 	ret =
 	    cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
 	if (ret < 0) {
-		pr_err("CAIF: %s(): Could not transmit enum message\n",
-			__func__);
+		pr_err("Could not transmit enum message\n");
 		cfpkt_destroy(pkt);
 	}
 }
@@ -208,7 +207,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
 	char utility_name[16];
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return -ENOMEM;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
@@ -253,13 +252,13 @@ int cfctrl_linkup_request(struct cflayer *layer,
 			       param->u.utility.paramlen);
 		break;
 	default:
-		pr_warning("CAIF: %s():Request setup of bad link type = %d\n",
-			   __func__, param->linktype);
+		pr_warn("Request setup of bad link type = %d\n",
+			param->linktype);
 		return -EINVAL;
 	}
 	req = kzalloc(sizeof(*req), GFP_KERNEL);
 	if (!req) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return -ENOMEM;
 	}
 	req->client_layer = user_layer;
@@ -276,8 +275,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
 	ret =
 	    cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
 	if (ret < 0) {
-		pr_err("CAIF: %s(): Could not transmit linksetup request\n",
-			__func__);
+		pr_err("Could not transmit linksetup request\n");
 		cfpkt_destroy(pkt);
 		return -ENODEV;
 	}
@@ -291,7 +289,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return -ENOMEM;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
@@ -300,8 +298,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
 	ret =
 	    cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
 	if (ret < 0) {
-		pr_err("CAIF: %s(): Could not transmit link-down request\n",
-			__func__);
+		pr_err("Could not transmit link-down request\n");
 		cfpkt_destroy(pkt);
 	}
 	return ret;
@@ -313,7 +310,7 @@ void cfctrl_sleep_req(struct cflayer *layer)
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_SLEEP);
@@ -330,7 +327,7 @@ void cfctrl_wake_req(struct cflayer *layer)
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_WAKE);
@@ -347,7 +344,7 @@ void cfctrl_getstartreason_req(struct cflayer *layer)
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_START_REASON);
@@ -364,12 +361,11 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
 	struct cfctrl_request_info *p, *tmp;
 	struct cfctrl *ctrl = container_obj(layr);
 	spin_lock(&ctrl->info_list_lock);
-	pr_warning("CAIF: %s(): enter\n", __func__);
+	pr_warn("enter\n");
 
 	list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
 		if (p->client_layer == adap_layer) {
-			pr_warning("CAIF: %s(): cancel req :%d\n", __func__,
-					p->sequence_no);
+			pr_warn("cancel req :%d\n", p->sequence_no);
 			list_del(&p->list);
 			kfree(p);
 		}
@@ -395,7 +391,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 	cmd = cmdrsp & CFCTRL_CMD_MASK;
 	if (cmd != CFCTRL_CMD_LINK_ERR
 	    && CFCTRL_RSP_BIT != (CFCTRL_RSP_BIT & cmdrsp)) {
-		if (handle_loop(cfctrl, cmd, pkt) == CAIF_FAILURE)
+		if (handle_loop(cfctrl, cmd, pkt) != 0)
 			cmdrsp |= CFCTRL_ERR_BIT;
 	}
 
@@ -520,9 +516,8 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 				cfpkt_extr_head(pkt, &param, len);
 				break;
 			default:
-				pr_warning("CAIF: %s(): Request setup "
-					   "- invalid link type (%d)",
-					   __func__, serv);
+				pr_warn("Request setup - invalid link type (%d)\n",
+					serv);
 				goto error;
 			}
 
@@ -532,9 +527,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 
 			if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
 				cfpkt_erroneous(pkt)) {
-				pr_err("CAIF: %s(): Invalid O/E bit or parse "
-				       "error on CAIF control channel",
-					__func__);
+				pr_err("Invalid O/E bit or parse error on CAIF control channel\n");
 				cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
 						       0,
 						       req ? req->client_layer
@@ -556,8 +549,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 		cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
 		break;
 	case CFCTRL_CMD_LINK_ERR:
-		pr_err("CAIF: %s(): Frame Error Indication received\n",
-			__func__);
+		pr_err("Frame Error Indication received\n");
 		cfctrl->res.linkerror_ind();
 		break;
 	case CFCTRL_CMD_ENUM:
@@ -576,7 +568,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 		cfctrl->res.radioset_rsp();
 		break;
 	default:
-		pr_err("CAIF: %s(): Unrecognized Control Frame\n", __func__);
+		pr_err("Unrecognized Control Frame\n");
 		goto error;
 		break;
 	}
@@ -595,8 +587,7 @@ static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 	case CAIF_CTRLCMD_FLOW_OFF_IND:
 		spin_lock(&this->info_list_lock);
 		if (!list_empty(&this->list)) {
-			pr_debug("CAIF: %s(): Received flow off in "
-				   "control layer", __func__);
+			pr_debug("Received flow off in control layer\n");
 		}
 		spin_unlock(&this->info_list_lock);
 		break;
@@ -620,7 +611,7 @@ static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
 			if (!ctrl->loop_linkused[linkid])
 				goto found;
 		spin_unlock(&ctrl->loop_linkid_lock);
-		pr_err("CAIF: %s(): Out of link-ids\n", __func__);
+		pr_err("Out of link-ids\n");
 		return -EINVAL;
 found:
 		if (!ctrl->loop_linkused[linkid])
@@ -647,6 +638,6 @@ found:
 	default:
 		break;
 	}
-	return CAIF_SUCCESS;
+	return 0;
 }
 #endif
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
index ab6b6dc34cf..496fda9ac66 100644
--- a/net/caif/cfdbgl.c
+++ b/net/caif/cfdbgl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <net/caif/caif_layer.h>
@@ -17,12 +19,12 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!dbg) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
 	memset(dbg, 0, sizeof(struct cfsrvl));
-	cfsrvl_init(dbg, channel_id, dev_info);
+	cfsrvl_init(dbg, channel_id, dev_info, false);
 	dbg->layer.receive = cfdbgl_receive;
 	dbg->layer.transmit = cfdbgl_transmit;
 	snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ - 1, "dbg%d", channel_id);
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index 53194840ecb..d3ed264ad6c 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -17,6 +19,7 @@
 #define DGM_FLOW_OFF 0x81
 #define DGM_FLOW_ON  0x80
 #define DGM_CTRL_PKT_SIZE 1
+#define DGM_MTU 1500
 
 static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
 static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt);
@@ -25,12 +28,12 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!dgm) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
 	memset(dgm, 0, sizeof(struct cfsrvl));
-	cfsrvl_init(dgm, channel_id, dev_info);
+	cfsrvl_init(dgm, channel_id, dev_info, true);
 	dgm->layer.receive = cfdgml_receive;
 	dgm->layer.transmit = cfdgml_transmit;
 	snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ - 1, "dgm%d", channel_id);
@@ -48,14 +51,14 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
 	caif_assert(layr->ctrlcmd != NULL);
 
 	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
 
 	if ((cmd & DGM_CMD_BIT) == 0) {
 		if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) {
-			pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+			pr_err("Packet is erroneous!\n");
 			cfpkt_destroy(pkt);
 			return -EPROTO;
 		}
@@ -74,8 +77,7 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
 		return 0;
 	default:
 		cfpkt_destroy(pkt);
-		pr_info("CAIF: %s(): Unknown datagram control %d (0x%x)\n",
-			__func__, cmd, cmd);
+		pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
 		return -EPROTO;
 	}
 }
@@ -89,6 +91,10 @@ static int cfdgml_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	if (!cfsrvl_ready(service, &ret))
 		return ret;
 
+	/* STE Modem cannot handle more than 1500 bytes datagrams */
+	if (cfpkt_getlen(pkt) > DGM_MTU)
+		return -EMSGSIZE;
+
 	cfpkt_add_head(pkt, &zero, 4);
 
 	/* Add info for MUX-layer to route the packet out. */
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index e86a4ca3b21..a445043931a 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -6,6 +6,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -32,7 +34,7 @@ struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
 {
 	struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cffrml, layer) == 0);
@@ -83,7 +85,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
 
 	if (cfpkt_setlen(pkt, len) < 0) {
 		++cffrml_rcv_error;
-		pr_err("CAIF: %s():Framing length error (%d)\n", __func__, len);
+		pr_err("Framing length error (%d)\n", len);
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -99,14 +101,14 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
 			cfpkt_add_trail(pkt, &tmp, 2);
 			++cffrml_rcv_error;
 			++cffrml_rcv_checsum_error;
-			pr_info("CAIF: %s(): Frame checksum error "
-				"(0x%x != 0x%x)\n", __func__, hdrchks, pktchks);
+			pr_info("Frame checksum error (0x%x != 0x%x)\n",
+				hdrchks, pktchks);
 			return -EILSEQ;
 		}
 	}
 	if (cfpkt_erroneous(pkt)) {
 		++cffrml_rcv_error;
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -132,7 +134,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	cfpkt_add_head(pkt, &tmp, 2);
 	cfpkt_info(pkt)->hdr_len += 2;
 	if (cfpkt_erroneous(pkt)) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		return -EPROTO;
 	}
 	ret = layr->dn->transmit(layr->dn, pkt);
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 80c8d332b25..46f34b2e047 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -3,6 +3,9 @@
  * Author:	Sjur Brendeland/sjur.brandeland@stericsson.com
  * License terms: GNU General Public License (GPL) version 2
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -190,7 +193,7 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
 	u8 id;
 	struct cflayer *up;
 	if (cfpkt_extr_head(pkt, &id, 1) < 0) {
-		pr_err("CAIF: %s(): erroneous Caif Packet\n", __func__);
+		pr_err("erroneous Caif Packet\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -199,8 +202,8 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
 	up = get_up(muxl, id);
 	spin_unlock(&muxl->receive_lock);
 	if (up == NULL) {
-		pr_info("CAIF: %s():Received data on unknown link ID = %d "
-			"(0x%x)	 up == NULL", __func__, id, id);
+		pr_info("Received data on unknown link ID = %d (0x%x) up == NULL",
+			id, id);
 		cfpkt_destroy(pkt);
 		/*
 		 * Don't return ERROR, since modem misbehaves and sends out
@@ -223,9 +226,8 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	struct caif_payload_info *info = cfpkt_info(pkt);
 	dn = get_dn(muxl, cfpkt_info(pkt)->dev_info);
 	if (dn == NULL) {
-		pr_warning("CAIF: %s(): Send data on unknown phy "
-			   "ID = %d (0x%x)\n",
-			   __func__, info->dev_info->id, info->dev_info->id);
+		pr_warn("Send data on unknown phy ID = %d (0x%x)\n",
+			info->dev_info->id, info->dev_info->id);
 		return -ENOTCONN;
 	}
 	info->hdr_len += 1;
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index a6fdf899741..d7e865e2ff6 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -4,19 +4,22 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/hardirq.h>
 #include <net/caif/cfpkt.h>
 
-#define PKT_PREFIX CAIF_NEEDED_HEADROOM
-#define PKT_POSTFIX CAIF_NEEDED_TAILROOM
+#define PKT_PREFIX  48
+#define PKT_POSTFIX 2
 #define PKT_LEN_WHEN_EXTENDING 128
-#define PKT_ERROR(pkt, errmsg) do {	   \
-    cfpkt_priv(pkt)->erronous = true;	   \
-    skb_reset_tail_pointer(&pkt->skb);	   \
-    pr_warning("CAIF: " errmsg);\
-  } while (0)
+#define PKT_ERROR(pkt, errmsg)		   \
+do {					   \
+	cfpkt_priv(pkt)->erronous = true;  \
+	skb_reset_tail_pointer(&pkt->skb); \
+	pr_warn(errmsg);		   \
+} while (0)
 
 struct cfpktq {
 	struct sk_buff_head head;
@@ -130,13 +133,13 @@ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
 		return -EPROTO;
 
 	if (unlikely(len > skb->len)) {
-		PKT_ERROR(pkt, "cfpkt_extr_head read beyond end of packet\n");
+		PKT_ERROR(pkt, "read beyond end of packet\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(len > skb_headlen(skb))) {
 		if (unlikely(skb_linearize(skb) != 0)) {
-			PKT_ERROR(pkt, "cfpkt_extr_head linearize failed\n");
+			PKT_ERROR(pkt, "linearize failed\n");
 			return -EPROTO;
 		}
 	}
@@ -156,11 +159,11 @@ int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
 		return -EPROTO;
 
 	if (unlikely(skb_linearize(skb) != 0)) {
-		PKT_ERROR(pkt, "cfpkt_extr_trail linearize failed\n");
+		PKT_ERROR(pkt, "linearize failed\n");
 		return -EPROTO;
 	}
 	if (unlikely(skb->data + len > skb_tail_pointer(skb))) {
-		PKT_ERROR(pkt, "cfpkt_extr_trail read beyond end of packet\n");
+		PKT_ERROR(pkt, "read beyond end of packet\n");
 		return -EPROTO;
 	}
 	from = skb_tail_pointer(skb) - len;
@@ -202,7 +205,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
 
 		/* Make sure data is writable */
 		if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) {
-			PKT_ERROR(pkt, "cfpkt_add_body: cow failed\n");
+			PKT_ERROR(pkt, "cow failed\n");
 			return -EPROTO;
 		}
 		/*
@@ -211,8 +214,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
 		 * lengths of the top SKB.
 		 */
 		if (lastskb != skb) {
-			pr_warning("CAIF: %s(): Packet is non-linear\n",
-				   __func__);
+			pr_warn("Packet is non-linear\n");
 			skb->len += len;
 			skb->data_len += len;
 		}
@@ -242,14 +244,14 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
 	if (unlikely(is_erronous(pkt)))
 		return -EPROTO;
 	if (unlikely(skb_headroom(skb) < len)) {
-		PKT_ERROR(pkt, "cfpkt_add_head: no headroom\n");
+		PKT_ERROR(pkt, "no headroom\n");
 		return -EPROTO;
 	}
 
 	/* Make sure data is writable */
 	ret = skb_cow_data(skb, 0, &lastskb);
 	if (unlikely(ret < 0)) {
-		PKT_ERROR(pkt, "cfpkt_add_head: cow failed\n");
+		PKT_ERROR(pkt, "cow failed\n");
 		return ret;
 	}
 
@@ -283,7 +285,7 @@ inline u16 cfpkt_iterate(struct cfpkt *pkt,
 	if (unlikely(is_erronous(pkt)))
 		return -EPROTO;
 	if (unlikely(skb_linearize(&pkt->skb) != 0)) {
-		PKT_ERROR(pkt, "cfpkt_iterate: linearize failed\n");
+		PKT_ERROR(pkt, "linearize failed\n");
 		return -EPROTO;
 	}
 	return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt));
@@ -309,7 +311,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
 
 	/* Need to expand SKB */
 	if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len)))
-		PKT_ERROR(pkt, "cfpkt_setlen: skb_pad_trail failed\n");
+		PKT_ERROR(pkt, "skb_pad_trail failed\n");
 
 	return cfpkt_getlen(pkt);
 }
@@ -338,7 +340,6 @@ struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
 	u16 dstlen;
 	u16 createlen;
 	if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) {
-		cfpkt_destroy(addpkt);
 		return dstpkt;
 	}
 	if (expectlen > addlen)
@@ -381,8 +382,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
 		return NULL;
 
 	if (skb->data + pos > skb_tail_pointer(skb)) {
-		PKT_ERROR(pkt,
-			  "cfpkt_split: trying to split beyond end of packet");
+		PKT_ERROR(pkt, "trying to split beyond end of packet\n");
 		return NULL;
 	}
 
@@ -456,17 +456,17 @@ int cfpkt_raw_append(struct cfpkt *pkt, void **buf, unsigned int buflen)
 		return -EPROTO;
 	/* Make sure SKB is writable */
 	if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) {
-		PKT_ERROR(pkt, "cfpkt_raw_append: skb_cow_data failed\n");
+		PKT_ERROR(pkt, "skb_cow_data failed\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(skb_linearize(skb) != 0)) {
-		PKT_ERROR(pkt, "cfpkt_raw_append: linearize failed\n");
+		PKT_ERROR(pkt, "linearize failed\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(skb_tailroom(skb) < buflen)) {
-		PKT_ERROR(pkt, "cfpkt_raw_append: buffer too short - failed\n");
+		PKT_ERROR(pkt, "buffer too short - failed\n");
 		return -EPROTO;
 	}
 
@@ -484,14 +484,13 @@ int cfpkt_raw_extract(struct cfpkt *pkt, void **buf, unsigned int buflen)
 		return -EPROTO;
 
 	if (unlikely(buflen > skb->len)) {
-		PKT_ERROR(pkt, "cfpkt_raw_extract: buflen too large "
-				"- failed\n");
+		PKT_ERROR(pkt, "buflen too large - failed\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(buflen > skb_headlen(skb))) {
 		if (unlikely(skb_linearize(skb) != 0)) {
-			PKT_ERROR(pkt, "cfpkt_raw_extract: linearize failed\n");
+			PKT_ERROR(pkt, "linearize failed\n");
 			return -EPROTO;
 		}
 	}
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index fd27b172fb5..bde8481e8d2 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -4,105 +4,305 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
+#include <asm/unaligned.h>
 #include <net/caif/caif_layer.h>
 #include <net/caif/cfsrvl.h>
 #include <net/caif/cfpkt.h>
 
-#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
-
+#define container_obj(layr) container_of(layr, struct cfrfml, serv.layer)
 #define RFM_SEGMENTATION_BIT 0x01
-#define RFM_PAYLOAD  0x00
-#define RFM_CMD_BIT  0x80
-#define RFM_FLOW_OFF 0x81
-#define RFM_FLOW_ON  0x80
-#define RFM_SET_PIN  0x82
-#define RFM_CTRL_PKT_SIZE 1
+#define RFM_HEAD_SIZE 7
 
 static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt);
 static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt);
-static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl);
 
-struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info)
+struct cfrfml {
+	struct cfsrvl serv;
+	struct cfpkt *incomplete_frm;
+	int fragment_size;
+	u8  seghead[6];
+	u16 pdu_size;
+	/* Protects serialized processing of packets */
+	spinlock_t sync;
+};
+
+static void cfrfml_release(struct kref *kref)
+{
+	struct cfsrvl *srvl = container_of(kref, struct cfsrvl, ref);
+	struct cfrfml *rfml = container_obj(&srvl->layer);
+
+	if (rfml->incomplete_frm)
+		cfpkt_destroy(rfml->incomplete_frm);
+
+	kfree(srvl);
+}
+
+struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
+					int mtu_size)
 {
-	struct cfsrvl *rfm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
-	if (!rfm) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+	int tmp;
+	struct cfrfml *this =
+		kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
+
+	if (!this) {
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
-	caif_assert(offsetof(struct cfsrvl, layer) == 0);
-	memset(rfm, 0, sizeof(struct cfsrvl));
-	cfsrvl_init(rfm, channel_id, dev_info);
-	rfm->layer.modemcmd = cfservl_modemcmd;
-	rfm->layer.receive = cfrfml_receive;
-	rfm->layer.transmit = cfrfml_transmit;
-	snprintf(rfm->layer.name, CAIF_LAYER_NAME_SZ, "rfm%d", channel_id);
-	return &rfm->layer;
+
+	cfsrvl_init(&this->serv, channel_id, dev_info, false);
+	this->serv.release = cfrfml_release;
+	this->serv.layer.receive = cfrfml_receive;
+	this->serv.layer.transmit = cfrfml_transmit;
+
+	/* Round down to closest multiple of 16 */
+	tmp = (mtu_size - RFM_HEAD_SIZE - 6) / 16;
+	tmp *= 16;
+
+	this->fragment_size = tmp;
+	spin_lock_init(&this->sync);
+	snprintf(this->serv.layer.name, CAIF_LAYER_NAME_SZ,
+		"rfm%d", channel_id);
+
+	return &this->serv.layer;
 }
 
-static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
+static struct cfpkt *rfm_append(struct cfrfml *rfml, char *seghead,
+			struct cfpkt *pkt, int *err)
 {
-       return -EPROTO;
+	struct cfpkt *tmppkt;
+	*err = -EPROTO;
+	/* n-th but not last segment */
+
+	if (cfpkt_extr_head(pkt, seghead, 6) < 0)
+		return NULL;
+
+	/* Verify correct header */
+	if (memcmp(seghead, rfml->seghead, 6) != 0)
+		return NULL;
+
+	tmppkt = cfpkt_append(rfml->incomplete_frm, pkt,
+			rfml->pdu_size + RFM_HEAD_SIZE);
+
+	/* If cfpkt_append failes input pkts are not freed */
+	*err = -ENOMEM;
+	if (tmppkt == NULL)
+		return NULL;
+
+	*err = 0;
+	return tmppkt;
 }
 
 static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
 {
 	u8 tmp;
 	bool segmented;
-	int ret;
+	int err;
+	u8 seghead[6];
+	struct cfrfml *rfml;
+	struct cfpkt *tmppkt = NULL;
+
 	caif_assert(layr->up != NULL);
 	caif_assert(layr->receive != NULL);
+	rfml = container_obj(layr);
+	spin_lock(&rfml->sync);
+
+	err = -EPROTO;
+	if (cfpkt_extr_head(pkt, &tmp, 1) < 0)
+		goto out;
+	segmented = tmp & RFM_SEGMENTATION_BIT;
+
+	if (segmented) {
+		if (rfml->incomplete_frm == NULL) {
+			/* Initial Segment */
+			if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0)
+				goto out;
+
+			rfml->pdu_size = get_unaligned_le16(rfml->seghead+4);
+
+			if (cfpkt_erroneous(pkt))
+				goto out;
+			rfml->incomplete_frm = pkt;
+			pkt = NULL;
+		} else {
+
+			tmppkt = rfm_append(rfml, seghead, pkt, &err);
+			if (tmppkt == NULL)
+				goto out;
+
+			if (cfpkt_erroneous(tmppkt))
+				goto out;
+
+			rfml->incomplete_frm = tmppkt;
+
+
+			if (cfpkt_erroneous(tmppkt))
+				goto out;
+		}
+		err = 0;
+		goto out;
+	}
+
+	if (rfml->incomplete_frm) {
+
+		/* Last Segment */
+		tmppkt = rfm_append(rfml, seghead, pkt, &err);
+		if (tmppkt == NULL)
+			goto out;
+
+		if (cfpkt_erroneous(tmppkt))
+			goto out;
+
+		rfml->incomplete_frm = NULL;
+		pkt = tmppkt;
+		tmppkt = NULL;
+
+		/* Verify that length is correct */
+		err = EPROTO;
+		if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1)
+			goto out;
+	}
+
+	err = rfml->serv.layer.up->receive(rfml->serv.layer.up, pkt);
+
+out:
+
+	if (err != 0) {
+		if (tmppkt)
+			cfpkt_destroy(tmppkt);
+		if (pkt)
+			cfpkt_destroy(pkt);
+		if (rfml->incomplete_frm)
+			cfpkt_destroy(rfml->incomplete_frm);
+		rfml->incomplete_frm = NULL;
+
+		pr_info("Connection error %d triggered on RFM link\n", err);
+
+		/* Trigger connection error upon failure.*/
+		layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+					rfml->serv.dev_info.id);
+	}
+	spin_unlock(&rfml->sync);
+	return err;
+}
+
+
+static int cfrfml_transmit_segment(struct cfrfml *rfml, struct cfpkt *pkt)
+{
+	caif_assert(cfpkt_getlen(pkt) >= rfml->fragment_size);
+
+	/* Add info for MUX-layer to route the packet out. */
+	cfpkt_info(pkt)->channel_id = rfml->serv.layer.id;
 
 	/*
-	 * RFM is taking care of segmentation and stripping of
-	 * segmentation bit.
+	 * To optimize alignment, we add up the size of CAIF header before
+	 * payload.
 	 */
-	if (cfpkt_extr_head(pkt, &tmp, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
-		cfpkt_destroy(pkt);
-		return -EPROTO;
-	}
-	segmented = tmp & RFM_SEGMENTATION_BIT;
-	caif_assert(!segmented);
+	cfpkt_info(pkt)->hdr_len = RFM_HEAD_SIZE;
+	cfpkt_info(pkt)->dev_info = &rfml->serv.dev_info;
 
-	ret = layr->up->receive(layr->up, pkt);
-	return ret;
+	return rfml->serv.layer.dn->transmit(rfml->serv.layer.dn, pkt);
 }
 
 static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
 {
-	u8 tmp = 0;
-	int ret;
-	struct cfsrvl *service = container_obj(layr);
+	int err;
+	u8 seg;
+	u8 head[6];
+	struct cfpkt *rearpkt = NULL;
+	struct cfpkt *frontpkt = pkt;
+	struct cfrfml *rfml = container_obj(layr);
 
 	caif_assert(layr->dn != NULL);
 	caif_assert(layr->dn->transmit != NULL);
 
-	if (!cfsrvl_ready(service, &ret))
-		return ret;
+	if (!cfsrvl_ready(&rfml->serv, &err))
+		return err;
+
+	err = -EPROTO;
+	if (cfpkt_getlen(pkt) <= RFM_HEAD_SIZE-1)
+		goto out;
+
+	err = 0;
+	if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE)
+		err = cfpkt_peek_head(pkt, head, 6);
+
+	if (err < 0)
+		goto out;
+
+	while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) {
+
+		seg = 1;
+		err = -EPROTO;
+
+		if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+			goto out;
+		/*
+		 * On OOM error cfpkt_split returns NULL.
+		 *
+		 * NOTE: Segmented pdu is not correctly aligned.
+		 * This has negative performance impact.
+		 */
+
+		rearpkt = cfpkt_split(frontpkt, rfml->fragment_size);
+		if (rearpkt == NULL)
+			goto out;
+
+		err = cfrfml_transmit_segment(rfml, frontpkt);
+
+		if (err != 0)
+			goto out;
+		frontpkt = rearpkt;
+		rearpkt = NULL;
+
+		err = -ENOMEM;
+		if (frontpkt == NULL)
+			goto out;
+		err = -EPROTO;
+		if (cfpkt_add_head(frontpkt, head, 6) < 0)
+			goto out;
 
-	if (cfpkt_getlen(pkt) > CAIF_MAX_PAYLOAD_SIZE) {
-		pr_err("CAIF: %s():Packet too large - size=%d\n",
-			__func__, cfpkt_getlen(pkt));
-		return -EOVERFLOW;
 	}
-	if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
-		return -EPROTO;
+
+	seg = 0;
+	err = -EPROTO;
+
+	if (cfpkt_add_head(frontpkt, &seg, 1) < 0)
+		goto out;
+
+	err = cfrfml_transmit_segment(rfml, frontpkt);
+
+	frontpkt = NULL;
+out:
+
+	if (err != 0) {
+		pr_info("Connection error %d triggered on RFM link\n", err);
+		/* Trigger connection error upon failure.*/
+
+		layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
+					rfml->serv.dev_info.id);
+
+		if (rearpkt)
+			cfpkt_destroy(rearpkt);
+
+		if (frontpkt && frontpkt != pkt) {
+
+			cfpkt_destroy(frontpkt);
+			/*
+			 * Socket layer will free the original packet,
+			 * but this packet may already be sent and
+			 * freed. So we have to return 0 in this case
+			 * to avoid socket layer to re-free this packet.
+			 * The return of shutdown indication will
+			 * cause connection to be invalidated anyhow.
+			 */
+			err = 0;
+		}
 	}
 
-	/* Add info for MUX-layer to route the packet out. */
-	cfpkt_info(pkt)->channel_id = service->layer.id;
-	/*
-	 * To optimize alignment, we add up the size of CAIF header before
-	 * payload.
-	 */
-	cfpkt_info(pkt)->hdr_len = 1;
-	cfpkt_info(pkt)->dev_info = &service->dev_info;
-	ret = layr->dn->transmit(layr->dn, pkt);
-	if (ret < 0)
-		cfpkt_extr_head(pkt, &tmp, 1);
-	return ret;
+	return err;
 }
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index 965c5baace4..9297f7dea9d 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -14,7 +16,8 @@
 #define container_obj(layr) ((struct cfserl *) layr)
 
 #define CFSERL_STX 0x02
-#define CAIF_MINIUM_PACKET_SIZE 4
+#define SERIAL_MINIUM_PACKET_SIZE 4
+#define SERIAL_MAX_FRAMESIZE 4096
 struct cfserl {
 	struct cflayer layer;
 	struct cfpkt *incomplete_frm;
@@ -33,7 +36,7 @@ struct cflayer *cfserl_create(int type, int instance, bool use_stx)
 {
 	struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfserl, layer) == 0);
@@ -119,8 +122,8 @@ static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
 		/*
 		 * Frame error handling
 		 */
-		if (expectlen < CAIF_MINIUM_PACKET_SIZE
-		    || expectlen > CAIF_MAX_FRAMESIZE) {
+		if (expectlen < SERIAL_MINIUM_PACKET_SIZE
+		    || expectlen > SERIAL_MAX_FRAMESIZE) {
 			if (!layr->usestx) {
 				if (pkt != NULL)
 					cfpkt_destroy(pkt);
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index 6e5b7079a68..ab5e542526b 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/errno.h>
@@ -24,8 +26,10 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 				int phyid)
 {
 	struct cfsrvl *service = container_obj(layr);
+
 	caif_assert(layr->up != NULL);
 	caif_assert(layr->up->ctrlcmd != NULL);
+
 	switch (ctrl) {
 	case CAIF_CTRLCMD_INIT_RSP:
 		service->open = true;
@@ -77,8 +81,7 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 		layr->up->ctrlcmd(layr->up, ctrl, phyid);
 		break;
 	default:
-		pr_warning("CAIF: %s(): "
-			   "Unexpected ctrl in cfsrvl (%d)\n", __func__, ctrl);
+		pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
 		/* We have both modem and phy flow on, send flow on */
 		layr->up->ctrlcmd(layr->up, ctrl, phyid);
 		service->phy_flow_on = true;
@@ -89,9 +92,14 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
 {
 	struct cfsrvl *service = container_obj(layr);
+
 	caif_assert(layr != NULL);
 	caif_assert(layr->dn != NULL);
 	caif_assert(layr->dn->transmit != NULL);
+
+	if (!service->supports_flowctrl)
+		return 0;
+
 	switch (ctrl) {
 	case CAIF_MODEMCMD_FLOW_ON_REQ:
 		{
@@ -100,14 +108,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
 			u8 flow_on = SRVL_FLOW_ON;
 			pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
 			if (!pkt) {
-				pr_warning("CAIF: %s(): Out of memory\n",
-					__func__);
+				pr_warn("Out of memory\n");
 				return -ENOMEM;
 			}
 
 			if (cfpkt_add_head(pkt, &flow_on, 1) < 0) {
-				pr_err("CAIF: %s(): Packet is erroneous!\n",
-					__func__);
+				pr_err("Packet is erroneous!\n");
 				cfpkt_destroy(pkt);
 				return -EPROTO;
 			}
@@ -124,14 +130,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
 			u8 flow_off = SRVL_FLOW_OFF;
 			pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
 			if (!pkt) {
-				pr_warning("CAIF: %s(): Out of memory\n",
-					__func__);
+				pr_warn("Out of memory\n");
 				return -ENOMEM;
 			}
 
 			if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
-				pr_err("CAIF: %s(): Packet is erroneous!\n",
-					__func__);
+				pr_err("Packet is erroneous!\n");
 				cfpkt_destroy(pkt);
 				return -EPROTO;
 			}
@@ -152,9 +156,17 @@ void cfservl_destroy(struct cflayer *layer)
 	kfree(layer);
 }
 
+void cfsrvl_release(struct kref *kref)
+{
+	struct cfsrvl *service = container_of(kref, struct cfsrvl, ref);
+	kfree(service);
+}
+
 void cfsrvl_init(struct cfsrvl *service,
-		 u8 channel_id,
-		 struct dev_info *dev_info)
+			u8 channel_id,
+			struct dev_info *dev_info,
+			bool supports_flowctrl
+			)
 {
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
 	service->open = false;
@@ -164,14 +176,11 @@ void cfsrvl_init(struct cfsrvl *service,
 	service->layer.ctrlcmd = cfservl_ctrlcmd;
 	service->layer.modemcmd = cfservl_modemcmd;
 	service->dev_info = *dev_info;
+	service->supports_flowctrl = supports_flowctrl;
+	service->release = cfsrvl_release;
 	kref_init(&service->ref);
 }
 
-void cfsrvl_release(struct kref *kref)
-{
-	struct cfsrvl *service = container_of(kref, struct cfsrvl, ref);
-	kfree(service);
-}
 
 bool cfsrvl_ready(struct cfsrvl *service, int *err)
 {
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index 5fd2c9ea8b4..efad410e4c8 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
@@ -26,12 +28,12 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!util) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
 	memset(util, 0, sizeof(struct cfsrvl));
-	cfsrvl_init(util, channel_id, dev_info);
+	cfsrvl_init(util, channel_id, dev_info, true);
 	util->layer.receive = cfutill_receive;
 	util->layer.transmit = cfutill_transmit;
 	snprintf(util->layer.name, CAIF_LAYER_NAME_SZ - 1, "util1");
@@ -47,7 +49,7 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
 	caif_assert(layr->up->receive != NULL);
 	caif_assert(layr->up->ctrlcmd != NULL);
 	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -64,16 +66,14 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
 		cfpkt_destroy(pkt);
 		return 0;
 	case UTIL_REMOTE_SHUTDOWN:	/* Remote Shutdown Request */
-		pr_err("CAIF: %s(): REMOTE SHUTDOWN REQUEST RECEIVED\n",
-			__func__);
+		pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
 		layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
 		service->open = false;
 		cfpkt_destroy(pkt);
 		return 0;
 	default:
 		cfpkt_destroy(pkt);
-		pr_warning("CAIF: %s(): Unknown service control %d (0x%x)\n",
-			   __func__, cmd, cmd);
+		pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
 		return -EPROTO;
 	}
 }
@@ -90,12 +90,6 @@ static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	if (!cfsrvl_ready(service, &ret))
 		return ret;
 
-	if (cfpkt_getlen(pkt) > CAIF_MAX_PAYLOAD_SIZE) {
-		pr_err("CAIF: %s(): packet too large size=%d\n",
-			__func__, cfpkt_getlen(pkt));
-		return -EOVERFLOW;
-	}
-
 	cfpkt_add_head(pkt, &zero, 1);
 	/* Add info for MUX-layer to route the packet out. */
 	info = cfpkt_info(pkt);
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index e04f7d964e8..3b425b189a9 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <net/caif/caif_layer.h>
@@ -25,12 +27,12 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!vei) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
 	memset(vei, 0, sizeof(struct cfsrvl));
-	cfsrvl_init(vei, channel_id, dev_info);
+	cfsrvl_init(vei, channel_id, dev_info, true);
 	vei->layer.receive = cfvei_receive;
 	vei->layer.transmit = cfvei_transmit;
 	snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ - 1, "vei%d", channel_id);
@@ -47,7 +49,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
 
 
 	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -67,8 +69,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
 		cfpkt_destroy(pkt);
 		return 0;
 	default:		/* SET RS232 PIN */
-		pr_warning("CAIF: %s():Unknown VEI control packet %d (0x%x)!\n",
-			   __func__, cmd, cmd);
+		pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -84,14 +85,9 @@ static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
 		return ret;
 	caif_assert(layr->dn != NULL);
 	caif_assert(layr->dn->transmit != NULL);
-	if (cfpkt_getlen(pkt) > CAIF_MAX_PAYLOAD_SIZE) {
-		pr_warning("CAIF: %s(): Packet too large - size=%d\n",
-			   __func__, cfpkt_getlen(pkt));
-		return -EOVERFLOW;
-	}
 
 	if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		return -EPROTO;
 	}
 
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
index 89ad4ea239f..bf6fef2a0ef 100644
--- a/net/caif/cfvidl.c
+++ b/net/caif/cfvidl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
@@ -21,13 +23,13 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!vid) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
 
 	memset(vid, 0, sizeof(struct cfsrvl));
-	cfsrvl_init(vid, channel_id, dev_info);
+	cfsrvl_init(vid, channel_id, dev_info, false);
 	vid->layer.receive = cfvidl_receive;
 	vid->layer.transmit = cfvidl_transmit;
 	snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ - 1, "vid1");
@@ -38,7 +40,7 @@ static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
 {
 	u32 videoheader;
 	if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 610966abe2d..84a422c9894 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -5,6 +5,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -23,14 +25,11 @@
 #include <net/caif/caif_dev.h>
 
 /* GPRS PDP connection has MTU to 1500 */
-#define SIZE_MTU 1500
+#define GPRS_PDP_MTU 1500
 /* 5 sec. connect timeout */
 #define CONNECT_TIMEOUT (5 * HZ)
 #define CAIF_NET_DEFAULT_QUEUE_LEN 500
 
-#undef pr_debug
-#define pr_debug pr_warning
-
 /*This list is protected by the rtnl lock. */
 static LIST_HEAD(chnl_net_list);
 
@@ -142,8 +141,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
 				int phyid)
 {
 	struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
-	pr_debug("CAIF: %s(): NET flowctrl func called flow: %s\n",
-		__func__,
+	pr_debug("NET flowctrl func called flow: %s\n",
 		flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
 		flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
 		flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
@@ -196,12 +194,12 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	priv = netdev_priv(dev);
 
 	if (skb->len > priv->netdev->mtu) {
-		pr_warning("CAIF: %s(): Size of skb exceeded MTU\n", __func__);
+		pr_warn("Size of skb exceeded MTU\n");
 		return -ENOSPC;
 	}
 
 	if (!priv->flowenabled) {
-		pr_debug("CAIF: %s(): dropping packets flow off\n", __func__);
+		pr_debug("dropping packets flow off\n");
 		return NETDEV_TX_BUSY;
 	}
 
@@ -232,50 +230,95 @@ static int chnl_net_open(struct net_device *dev)
 {
 	struct chnl_net *priv = NULL;
 	int result = -1;
+	int llifindex, headroom, tailroom, mtu;
+	struct net_device *lldev;
 	ASSERT_RTNL();
 	priv = netdev_priv(dev);
 	if (!priv) {
-		pr_debug("CAIF: %s(): chnl_net_open: no priv\n", __func__);
+		pr_debug("chnl_net_open: no priv\n");
 		return -ENODEV;
 	}
 
 	if (priv->state != CAIF_CONNECTING) {
 		priv->state = CAIF_CONNECTING;
-		result = caif_connect_client(&priv->conn_req, &priv->chnl);
+		result = caif_connect_client(&priv->conn_req, &priv->chnl,
+					&llifindex, &headroom, &tailroom);
 		if (result != 0) {
-				priv->state = CAIF_DISCONNECTED;
-				pr_debug("CAIF: %s(): err: "
-					"Unable to register and open device,"
-					" Err:%d\n",
-					__func__,
-					result);
-				return result;
+				pr_debug("err: "
+					 "Unable to register and open device,"
+					 " Err:%d\n",
+					 result);
+				goto error;
+		}
+
+		lldev = dev_get_by_index(dev_net(dev), llifindex);
+
+		if (lldev == NULL) {
+			pr_debug("no interface?\n");
+			result = -ENODEV;
+			goto error;
+		}
+
+		dev->needed_tailroom = tailroom + lldev->needed_tailroom;
+		dev->hard_header_len = headroom + lldev->hard_header_len +
+			lldev->needed_tailroom;
+
+		/*
+		 * MTU, head-room etc is not know before we have a
+		 * CAIF link layer device available. MTU calculation may
+		 * override initial RTNL configuration.
+		 * MTU is minimum of current mtu, link layer mtu pluss
+		 * CAIF head and tail, and PDP GPRS contexts max MTU.
+		 */
+		mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom));
+		mtu = min_t(int, GPRS_PDP_MTU, mtu);
+		dev_set_mtu(dev, mtu);
+		dev_put(lldev);
+
+		if (mtu < 100) {
+			pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
+			result = -ENODEV;
+			goto error;
 		}
 	}
 
+	rtnl_unlock();  /* Release RTNL lock during connect wait */
+
 	result = wait_event_interruptible_timeout(priv->netmgmt_wq,
 						priv->state != CAIF_CONNECTING,
 						CONNECT_TIMEOUT);
 
+	rtnl_lock();
+
 	if (result == -ERESTARTSYS) {
-		pr_debug("CAIF: %s(): wait_event_interruptible"
-			 " woken by a signal\n", __func__);
-		return -ERESTARTSYS;
+		pr_debug("wait_event_interruptible woken by a signal\n");
+		result = -ERESTARTSYS;
+		goto error;
 	}
+
 	if (result == 0) {
-		pr_debug("CAIF: %s(): connect timeout\n", __func__);
+		pr_debug("connect timeout\n");
 		caif_disconnect_client(&priv->chnl);
 		priv->state = CAIF_DISCONNECTED;
-		pr_debug("CAIF: %s(): state disconnected\n", __func__);
-		return -ETIMEDOUT;
+		pr_debug("state disconnected\n");
+		result = -ETIMEDOUT;
+		goto error;
 	}
 
 	if (priv->state != CAIF_CONNECTED) {
-		pr_debug("CAIF: %s(): connect failed\n", __func__);
-		return -ECONNREFUSED;
+		pr_debug("connect failed\n");
+		result = -ECONNREFUSED;
+		goto error;
 	}
-	pr_debug("CAIF: %s(): CAIF Netdevice connected\n", __func__);
+	pr_debug("CAIF Netdevice connected\n");
 	return 0;
+
+error:
+	caif_disconnect_client(&priv->chnl);
+	priv->state = CAIF_DISCONNECTED;
+	pr_debug("state disconnected\n");
+	return result;
+
 }
 
 static int chnl_net_stop(struct net_device *dev)
@@ -321,9 +364,7 @@ static void ipcaif_net_setup(struct net_device *dev)
 	dev->destructor = free_netdev;
 	dev->flags |= IFF_NOARP;
 	dev->flags |= IFF_POINTOPOINT;
-	dev->needed_headroom = CAIF_NEEDED_HEADROOM;
-	dev->needed_tailroom = CAIF_NEEDED_TAILROOM;
-	dev->mtu = SIZE_MTU;
+	dev->mtu = GPRS_PDP_MTU;
 	dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN;
 
 	priv = netdev_priv(dev);
@@ -366,7 +407,7 @@ static void caif_netlink_parms(struct nlattr *data[],
 				struct caif_connect_request *conn_req)
 {
 	if (!data) {
-		pr_warning("CAIF: %s: no params data found\n", __func__);
+		pr_warn("no params data found\n");
 		return;
 	}
 	if (data[IFLA_CAIF_IPV4_CONNID])
@@ -395,8 +436,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
 
 	ret = register_netdevice(dev);
 	if (ret)
-		pr_warning("CAIF: %s(): device rtml registration failed\n",
-			   __func__);
+		pr_warn("device rtml registration failed\n");
 	return ret;
 }
 
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 9c65e9deb9c..08ffe9e4be2 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -60,6 +60,13 @@
 #include <net/sock.h>
 #include <net/net_namespace.h>
 
+/*
+ * To send multiple CAN frame content within TX_SETUP or to filter
+ * CAN messages with multiplex index within RX_SETUP, the number of
+ * different filters is limited to 256 due to the one byte index value.
+ */
+#define MAX_NFRAMES 256
+
 /* use of last_frames[index].can_dlc */
 #define RX_RECV    0x40 /* received data for this element */
 #define RX_THR     0x80 /* element not been sent due to throttle feature */
@@ -89,16 +96,16 @@ struct bcm_op {
 	struct list_head list;
 	int ifindex;
 	canid_t can_id;
-	int flags;
+	u32 flags;
 	unsigned long frames_abs, frames_filtered;
 	struct timeval ival1, ival2;
 	struct hrtimer timer, thrtimer;
 	struct tasklet_struct tsklet, thrtsklet;
 	ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
 	int rx_ifindex;
-	int count;
-	int nframes;
-	int currframe;
+	u32 count;
+	u32 nframes;
+	u32 currframe;
 	struct can_frame *frames;
 	struct can_frame *last_frames;
 	struct can_frame sframe;
@@ -175,7 +182,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
 
 		seq_printf(m, "rx_op: %03X %-5s ",
 				op->can_id, bcm_proc_getifname(ifname, op->ifindex));
-		seq_printf(m, "[%d]%c ", op->nframes,
+		seq_printf(m, "[%u]%c ", op->nframes,
 				(op->flags & RX_CHECK_DLC)?'d':' ');
 		if (op->kt_ival1.tv64)
 			seq_printf(m, "timeo=%lld ",
@@ -198,7 +205,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
 
 	list_for_each_entry(op, &bo->tx_ops, list) {
 
-		seq_printf(m, "tx_op: %03X %s [%d] ",
+		seq_printf(m, "tx_op: %03X %s [%u] ",
 				op->can_id,
 				bcm_proc_getifname(ifname, op->ifindex),
 				op->nframes);
@@ -283,7 +290,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
 	struct can_frame *firstframe;
 	struct sockaddr_can *addr;
 	struct sock *sk = op->sk;
-	int datalen = head->nframes * CFSIZ;
+	unsigned int datalen = head->nframes * CFSIZ;
 	int err;
 
 	skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
@@ -468,7 +475,7 @@ rx_changed_settime:
  * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
  *                       received data stored in op->last_frames[]
  */
-static void bcm_rx_cmp_to_index(struct bcm_op *op, int index,
+static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
 				const struct can_frame *rxdata)
 {
 	/*
@@ -554,7 +561,8 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
 /*
  * bcm_rx_do_flush - helper for bcm_rx_thr_flush
  */
-static inline int bcm_rx_do_flush(struct bcm_op *op, int update, int index)
+static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
+				  unsigned int index)
 {
 	if ((op->last_frames) && (op->last_frames[index].can_dlc & RX_THR)) {
 		if (update)
@@ -575,7 +583,7 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
 	int updated = 0;
 
 	if (op->nframes > 1) {
-		int i;
+		unsigned int i;
 
 		/* for MUX filter we start at index 1 */
 		for (i = 1; i < op->nframes; i++)
@@ -624,7 +632,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
 {
 	struct bcm_op *op = (struct bcm_op *)data;
 	const struct can_frame *rxframe = (struct can_frame *)skb->data;
-	int i;
+	unsigned int i;
 
 	/* disable timeout */
 	hrtimer_cancel(&op->timer);
@@ -822,14 +830,15 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 {
 	struct bcm_sock *bo = bcm_sk(sk);
 	struct bcm_op *op;
-	int i, err;
+	unsigned int i;
+	int err;
 
 	/* we need a real device to send frames */
 	if (!ifindex)
 		return -ENODEV;
 
-	/* we need at least one can_frame */
-	if (msg_head->nframes < 1)
+	/* check nframes boundaries - we need at least one can_frame */
+	if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
 		return -EINVAL;
 
 	/* check the given can_id */
@@ -993,6 +1002,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
 		msg_head->nframes = 0;
 	}
 
+	/* the first element contains the mux-mask => MAX_NFRAMES + 1  */
+	if (msg_head->nframes > MAX_NFRAMES + 1)
+		return -EINVAL;
+
 	if ((msg_head->flags & RX_RTR_FRAME) &&
 	    ((msg_head->nframes != 1) ||
 	     (!(msg_head->can_id & CAN_RTR_FLAG))))
diff --git a/net/can/raw.c b/net/can/raw.c
index da99cf153b3..e88f610fdb7 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -90,23 +90,39 @@ struct raw_sock {
 	can_err_mask_t err_mask;
 };
 
+/*
+ * Return pointer to store the extra msg flags for raw_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *raw_flags(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) +
+					 sizeof(unsigned int)));
+
+	/* return pointer after struct sockaddr_can */
+	return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
 static inline struct raw_sock *raw_sk(const struct sock *sk)
 {
 	return (struct raw_sock *)sk;
 }
 
-static void raw_rcv(struct sk_buff *skb, void *data)
+static void raw_rcv(struct sk_buff *oskb, void *data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct raw_sock *ro = raw_sk(sk);
 	struct sockaddr_can *addr;
+	struct sk_buff *skb;
+	unsigned int *pflags;
 
 	/* check the received tx sock reference */
-	if (!ro->recv_own_msgs && skb->sk == sk)
+	if (!ro->recv_own_msgs && oskb->sk == sk)
 		return;
 
 	/* clone the given skb to be able to enqueue it into the rcv queue */
-	skb = skb_clone(skb, GFP_ATOMIC);
+	skb = skb_clone(oskb, GFP_ATOMIC);
 	if (!skb)
 		return;
 
@@ -123,6 +139,14 @@ static void raw_rcv(struct sk_buff *skb, void *data)
 	addr->can_family  = AF_CAN;
 	addr->can_ifindex = skb->dev->ifindex;
 
+	/* add CAN specific message flags for raw_recvmsg() */
+	pflags = raw_flags(skb);
+	*pflags = 0;
+	if (oskb->sk)
+		*pflags |= MSG_DONTROUTE;
+	if (oskb->sk == sk)
+		*pflags |= MSG_CONFIRM;
+
 	if (sock_queue_rcv_skb(sk, skb) < 0)
 		kfree_skb(skb);
 }
@@ -436,14 +460,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
 
 		if (count > 1) {
 			/* filter does not fit into dfilter => alloc space */
-			filter = kmalloc(optlen, GFP_KERNEL);
-			if (!filter)
-				return -ENOMEM;
-
-			if (copy_from_user(filter, optval, optlen)) {
-				kfree(filter);
-				return -EFAULT;
-			}
+			filter = memdup_user(optval, optlen);
+			if (IS_ERR(filter))
+				return PTR_ERR(filter);
 		} else if (count == 1) {
 			if (copy_from_user(&sfilter, optval, sizeof(sfilter)))
 				return -EFAULT;
@@ -652,9 +671,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
 	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 	if (err < 0)
 		goto free_skb;
-	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 	if (err < 0)
 		goto free_skb;
+
+	/* to be able to check the received tx sock reference in raw_rcv() */
+	skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
+
 	skb->dev = dev;
 	skb->sk  = sk;
 
@@ -708,6 +731,9 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
 		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
 	}
 
+	/* assign the flags that have been recorded in raw_rcv() */
+	msg->msg_flags |= *(raw_flags(skb));
+
 	skb_free_datagram(sk, skb);
 
 	return size;
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 00000000000..ad424049b0c
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
+config CEPH_LIB
+        tristate "Ceph core library (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	default n
+	help
+	  Choose Y or M here to include cephlib, which provides the
+	  common functionality to both the Ceph filesystem and
+	  to the rados block device (rbd).
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+	bool "Include file:line in ceph debug output"
+	depends on CEPH_LIB
+	default n
+	help
+	  If you say Y here, debug output will include a filename and
+	  line to aid debugging.  This increases kernel size and slows
+	  execution slightly when debug call sites are enabled (e.g.,
+	  via CONFIG_DYNAMIC_DEBUG).
+
+	  If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 00000000000..aab1cabb803
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+	mon_client.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+	debugfs.o \
+	auth.o auth_none.o \
+	crypto.o armor.o \
+	auth_x.o \
+	ceph_fs.o ceph_strings.o ceph_hash.o \
+	pagevec.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
+
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c
new file mode 100644
index 00000000000..eb2a666b0be
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
+
+#include <linux/errno.h>
+
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ */
+
+static const char *pem_key =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+	return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+	if (c >= 'A' && c <= 'Z')
+		return c - 'A';
+	if (c >= 'a' && c <= 'z')
+		return c - 'a' + 26;
+	if (c >= '0' && c <= '9')
+		return c - '0' + 52;
+	if (c == '+')
+		return 62;
+	if (c == '/')
+		return 63;
+	if (c == '=')
+		return 0; /* just non-negative, please */
+	return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+	int line = 0;
+
+	while (src < end) {
+		unsigned char a, b, c;
+
+		a = *src++;
+		*dst++ = encode_bits(a >> 2);
+		if (src < end) {
+			b = *src++;
+			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+			if (src < end) {
+				c = *src++;
+				*dst++ = encode_bits(((b & 15) << 2) |
+						     (c >> 6));
+				*dst++ = encode_bits(c & 63);
+			} else {
+				*dst++ = encode_bits((b & 15) << 2);
+				*dst++ = '=';
+			}
+		} else {
+			*dst++ = encode_bits(((a & 3) << 4));
+			*dst++ = '=';
+			*dst++ = '=';
+		}
+		olen += 4;
+		line += 4;
+		if (line == 64) {
+			line = 0;
+			*(dst++) = '\n';
+			olen++;
+		}
+	}
+	return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+
+	while (src < end) {
+		int a, b, c, d;
+
+		if (src < end && src[0] == '\n')
+			src++;
+		if (src + 4 > end)
+			return -EINVAL;
+		a = decode_bits(src[0]);
+		b = decode_bits(src[1]);
+		c = decode_bits(src[2]);
+		d = decode_bits(src[3]);
+		if (a < 0 || b < 0 || c < 0 || d < 0)
+			return -EINVAL;
+
+		*dst++ = (a << 2) | (b >> 4);
+		if (src[2] == '=')
+			return olen + 1;
+		*dst++ = ((b & 15) << 4) | (c >> 2);
+		if (src[3] == '=')
+			return olen + 2;
+		*dst++ = ((c & 3) << 6) | d;
+		olen += 3;
+		src += 4;
+	}
+	return olen;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
new file mode 100644
index 00000000000..549c1f43e1d
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+	CEPH_AUTH_NONE,
+	CEPH_AUTH_CEPHX
+};
+
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+	switch (protocol) {
+	case CEPH_AUTH_NONE:
+		return ceph_auth_none_init(ac);
+	case CEPH_AUTH_CEPHX:
+		return ceph_x_init(ac);
+	default:
+		return -ENOENT;
+	}
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+	struct ceph_auth_client *ac;
+	int ret;
+
+	dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+	ret = -ENOMEM;
+	ac = kzalloc(sizeof(*ac), GFP_NOFS);
+	if (!ac)
+		goto out;
+
+	ac->negotiating = true;
+	if (name)
+		ac->name = name;
+	else
+		ac->name = CEPH_AUTH_NAME_DEFAULT;
+	dout("auth_init name %s secret %s\n", ac->name, secret);
+	ac->secret = secret;
+	return ac;
+
+out:
+	return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+	dout("auth_destroy %p\n", ac);
+	if (ac->ops)
+		ac->ops->destroy(ac);
+	kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+	dout("auth_reset %p\n", ac);
+	if (ac->ops && !ac->negotiating)
+		ac->ops->reset(ac);
+	ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+	int len = strlen(name);
+
+	if (*p + 2*sizeof(u32) + len > end)
+		return -ERANGE;
+	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_32(p, len);
+	ceph_encode_copy(p, name, len);
+	return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+	struct ceph_mon_request_header *monhdr = buf;
+	void *p = monhdr + 1, *end = buf + len, *lenp;
+	int i, num;
+	int ret;
+
+	dout("auth_build_hello\n");
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+	lenp = p;
+	p += sizeof(u32);
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	ceph_encode_8(&p, 1);
+	num = ARRAY_SIZE(supported_protocols);
+	ceph_encode_32(&p, num);
+	ceph_decode_need(&p, end, num * sizeof(u32), bad);
+	for (i = 0; i < num; i++)
+		ceph_encode_32(&p, supported_protocols[i]);
+
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+	ceph_decode_need(&p, end, sizeof(u64), bad);
+	ceph_encode_64(&p, ac->global_id);
+
+	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+	return p - buf;
+
+bad:
+	return -ERANGE;
+}
+
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
+				   void *msg_buf, size_t msg_len)
+{
+	struct ceph_mon_request_header *monhdr = msg_buf;
+	void *p = monhdr + 1;
+	void *end = msg_buf + msg_len;
+	int ret;
+
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, ac->protocol);
+
+	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+	if (ret < 0) {
+		pr_err("error %d building auth method %s request\n", ret,
+		       ac->ops->name);
+		return ret;
+	}
+	dout(" built request %d bytes\n", ret);
+	ceph_encode_32(&p, ret);
+	return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+			   void *buf, size_t len,
+			   void *reply_buf, size_t reply_len)
+{
+	void *p = buf;
+	void *end = buf + len;
+	int protocol;
+	s32 result;
+	u64 global_id;
+	void *payload, *payload_end;
+	int payload_len;
+	char *result_msg;
+	int result_msg_len;
+	int ret = -EINVAL;
+
+	dout("handle_auth_reply %p %p\n", p, end);
+	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+	protocol = ceph_decode_32(&p);
+	result = ceph_decode_32(&p);
+	global_id = ceph_decode_64(&p);
+	payload_len = ceph_decode_32(&p);
+	payload = p;
+	p += payload_len;
+	ceph_decode_need(&p, end, sizeof(u32), bad);
+	result_msg_len = ceph_decode_32(&p);
+	result_msg = p;
+	p += result_msg_len;
+	if (p != end)
+		goto bad;
+
+	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+	     result_msg, global_id, payload_len);
+
+	payload_end = payload + payload_len;
+
+	if (global_id && ac->global_id != global_id) {
+		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+		ac->global_id = global_id;
+	}
+
+	if (ac->negotiating) {
+		/* server does not support our protocols? */
+		if (!protocol && result < 0) {
+			ret = result;
+			goto out;
+		}
+		/* set up (new) protocol handler? */
+		if (ac->protocol && ac->protocol != protocol) {
+			ac->ops->destroy(ac);
+			ac->protocol = 0;
+			ac->ops = NULL;
+		}
+		if (ac->protocol != protocol) {
+			ret = ceph_auth_init_protocol(ac, protocol);
+			if (ret) {
+				pr_err("error %d on auth protocol %d init\n",
+				       ret, protocol);
+				goto out;
+			}
+		}
+
+		ac->negotiating = false;
+	}
+
+	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+	if (ret == -EAGAIN) {
+		return ceph_build_auth_request(ac, reply_buf, reply_len);
+	} else if (ret) {
+		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+		return ret;
+	}
+	return 0;
+
+bad:
+	pr_err("failed to decode auth msg\n");
+out:
+	return ret;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len)
+{
+	if (!ac->protocol)
+		return ceph_auth_build_hello(ac, msg_buf, msg_len);
+	BUG_ON(!ac->ops);
+	if (ac->ops->should_authenticate(ac))
+		return ceph_build_auth_request(ac, msg_buf, msg_len);
+	return 0;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+	if (!ac->ops)
+		return 0;
+	return ac->ops->is_authenticated(ac);
+}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
new file mode 100644
index 00000000000..214c2bb43d6
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return !xi->starting;
+}
+
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+			void *buf, void *end)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = false;
+	return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id.  we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_auth_none_info *ai = ac->private;
+	struct ceph_none_authorizer *au = &ai->au;
+	void *p, *end;
+	int ret;
+
+	if (!ai->built_authorizer) {
+		p = au->buf;
+		end = p + sizeof(au->buf);
+		ceph_encode_8(&p, 1);
+		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+		if (ret < 0)
+			goto bad;
+		ceph_decode_need(&p, end, sizeof(u64), bad2);
+		ceph_encode_64(&p, ac->global_id);
+		au->buf_len = p - (void *)au->buf;
+		ai->built_authorizer = true;
+		dout("built authorizer len %d\n", au->buf_len);
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf;
+	*len = au->buf_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+
+bad2:
+	ret = -ERANGE;
+bad:
+	return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	/* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.name = "none",
+	.reset = reset,
+	.destroy = destroy,
+	.is_authenticated = is_authenticated,
+	.should_authenticate = should_authenticate,
+	.handle_reply = handle_reply,
+	.create_authorizer = ceph_auth_none_create_authorizer,
+	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi;
+
+	dout("ceph_auth_none_init %p\n", ac);
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		return -ENOMEM;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+
+	ac->protocol = CEPH_AUTH_NONE;
+	ac->private = xi;
+	ac->ops = &ceph_auth_none_ops;
+	return 0;
+}
+
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
new file mode 100644
index 00000000000..ed7d088b1bc
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+	char buf[128];
+	int buf_len;
+	char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+	bool starting;
+	bool built_authorizer;
+	struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
new file mode 100644
index 00000000000..7fd5dfcf6e1
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+#define TEMP_TICKET_BUF_LEN	256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return need != 0;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+			  void *ibuf, int ilen, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head = {
+		.struct_v = 1,
+		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+	};
+	size_t len = olen - sizeof(u32);
+	int ret;
+
+	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+			    &head, sizeof(head), ibuf, ilen);
+	if (ret)
+		return ret;
+	ceph_encode_32(&obuf, len);
+	return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+			  void **p, void *end, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head;
+	size_t head_len = sizeof(head);
+	int len, ret;
+
+	len = ceph_decode_32(p);
+	if (*p + len > end)
+		return -EINVAL;
+
+	dout("ceph_x_decrypt len %d\n", len);
+	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+			    *p, len);
+	if (ret)
+		return ret;
+	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+		return -EPERM;
+	*p += len;
+	return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+	struct ceph_x_ticket_handler *th;
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+	while (*p) {
+		parent = *p;
+		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+		if (service < th->service)
+			p = &(*p)->rb_left;
+		else if (service > th->service)
+			p = &(*p)->rb_right;
+		else
+			return th;
+	}
+
+	/* add it */
+	th = kzalloc(sizeof(*th), GFP_NOFS);
+	if (!th)
+		return ERR_PTR(-ENOMEM);
+	th->service = service;
+	rb_link_node(&th->node, parent, p);
+	rb_insert_color(&th->node, &xi->ticket_handlers);
+	return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+				  struct ceph_x_ticket_handler *th)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("remove_ticket_handler %p %d\n", th, th->service);
+	rb_erase(&th->node, &xi->ticket_handlers);
+	ceph_crypto_key_destroy(&th->session_key);
+	if (th->ticket_blob)
+		ceph_buffer_put(th->ticket_blob);
+	kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+				    struct ceph_crypto_key *secret,
+				    void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int num;
+	void *p = buf;
+	int ret;
+	char *dbuf;
+	char *ticket_buf;
+	u8 reply_struct_v;
+
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!dbuf)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+	if (!ticket_buf)
+		goto out_dbuf;
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	reply_struct_v = ceph_decode_8(&p);
+	if (reply_struct_v != 1)
+		goto bad;
+	num = ceph_decode_32(&p);
+	dout("%d tickets\n", num);
+	while (num--) {
+		int type;
+		u8 tkt_struct_v, blob_struct_v;
+		struct ceph_x_ticket_handler *th;
+		void *dp, *dend;
+		int dlen;
+		char is_enc;
+		struct timespec validity;
+		struct ceph_crypto_key old_key;
+		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
+
+		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+		type = ceph_decode_32(&p);
+		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+		tkt_struct_v = ceph_decode_8(&p);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		th = get_ticket_handler(ac, type);
+		if (IS_ERR(th)) {
+			ret = PTR_ERR(th);
+			goto out;
+		}
+
+		/* blob for me */
+		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+				      TEMP_TICKET_BUF_LEN);
+		if (dlen <= 0) {
+			ret = dlen;
+			goto out;
+		}
+		dout(" decrypted %d bytes\n", dlen);
+		dend = dbuf + dlen;
+		dp = dbuf;
+
+		tkt_struct_v = ceph_decode_8(&dp);
+		if (tkt_struct_v != 1)
+			goto bad;
+
+		memcpy(&old_key, &th->session_key, sizeof(old_key));
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+		if (ret)
+			goto out;
+
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
+
+		/* ticket blob for service */
+		ceph_decode_8_safe(&p, end, is_enc, bad);
+		tp = ticket_buf;
+		if (is_enc) {
+			/* encrypted */
+			dout(" encrypted ticket\n");
+			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+					      TEMP_TICKET_BUF_LEN);
+			if (dlen < 0) {
+				ret = dlen;
+				goto out;
+			}
+			dlen = ceph_decode_32(&tp);
+		} else {
+			/* unencrypted */
+			ceph_decode_32_safe(&p, end, dlen, bad);
+			ceph_decode_need(&p, end, dlen, bad);
+			ceph_decode_copy(&p, ticket_buf, dlen);
+		}
+		tpend = tp + dlen;
+		dout(" ticket blob is %d bytes\n", dlen);
+		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+		blob_struct_v = ceph_decode_8(&tp);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+		if (ret)
+			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
+		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+		     type, ceph_entity_type_name(type), th->secret_id,
+		     (int)th->ticket_blob->vec.iov_len);
+		xi->have_keys |= th->service;
+	}
+
+	ret = 0;
+out:
+	kfree(ticket_buf);
+out_dbuf:
+	kfree(dbuf);
+	return ret;
+
+bad:
+	ret = -EINVAL;
+	goto out;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+				   struct ceph_x_ticket_handler *th,
+				   struct ceph_x_authorizer *au)
+{
+	int maxlen;
+	struct ceph_x_authorize_a *msg_a;
+	struct ceph_x_authorize_b msg_b;
+	void *p, *end;
+	int ret;
+	int ticket_blob_len =
+		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+	dout("build_authorizer for %s %p\n",
+	     ceph_entity_type_name(th->service), au);
+
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout("  need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
+		ceph_buffer_put(au->buf);
+		au->buf = NULL;
+	}
+	if (!au->buf) {
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+		if (!au->buf)
+			return -ENOMEM;
+	}
+	au->service = th->service;
+
+	msg_a = au->buf->vec.iov_base;
+	msg_a->struct_v = 1;
+	msg_a->global_id = cpu_to_le64(ac->global_id);
+	msg_a->service_id = cpu_to_le32(th->service);
+	msg_a->ticket_blob.struct_v = 1;
+	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+	if (ticket_blob_len) {
+		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+		       th->ticket_blob->vec.iov_len);
+	}
+	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+	     le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+	p = msg_a + 1;
+	p += ticket_blob_len;
+	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+	get_random_bytes(&au->nonce, sizeof(au->nonce));
+	msg_b.struct_v = 1;
+	msg_b.nonce = cpu_to_le64(au->nonce);
+	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+			     p, end - p);
+	if (ret < 0)
+		goto out_buf;
+	p += ret;
+	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+	dout(" built authorizer nonce %llx len %d\n", au->nonce,
+	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
+	return 0;
+
+out_buf:
+	ceph_buffer_put(au->buf);
+	au->buf = NULL;
+	return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+				void **p, void *end)
+{
+	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, th->secret_id);
+	if (th->ticket_blob) {
+		const char *buf = th->ticket_blob->vec.iov_base;
+		u32 len = th->ticket_blob->vec.iov_len;
+
+		ceph_encode_32_safe(p, end, len, bad);
+		ceph_encode_copy_safe(p, end, buf, len, bad);
+	} else {
+		ceph_encode_32_safe(p, end, 0, bad);
+	}
+
+	return 0;
+bad:
+	return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+	int want = ac->want_keys;
+	struct ceph_x_info *xi = ac->private;
+	int service;
+
+	*pneed = ac->want_keys & ~(xi->have_keys);
+
+	for (service = 1; service <= want; service <<= 1) {
+		struct ceph_x_ticket_handler *th;
+
+		if (!(ac->want_keys & service))
+			continue;
+
+		if (*pneed & service)
+			continue;
+
+		th = get_ticket_handler(ac, service);
+
+		if (IS_ERR(th)) {
+			*pneed |= service;
+			continue;
+		}
+
+		if (get_seconds() >= th->renew_after)
+			*pneed |= service;
+		if (get_seconds() >= th->expires)
+			xi->have_keys &= ~service;
+	}
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+				void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+	struct ceph_x_request_header *head = buf;
+	int ret;
+	struct ceph_x_ticket_handler *th =
+		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	ceph_x_validate_tickets(ac, &need);
+
+	dout("build_request want %x have %x need %x\n",
+	     ac->want_keys, xi->have_keys, need);
+
+	if (need & CEPH_ENTITY_TYPE_AUTH) {
+		struct ceph_x_authenticate *auth = (void *)(head + 1);
+		void *p = auth + 1;
+		struct ceph_x_challenge_blob tmp;
+		char tmp_enc[40];
+		u64 *u;
+
+		if (p > end)
+			return -ERANGE;
+
+		dout(" get_auth_session_key\n");
+		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+		/* encrypt and hash */
+		get_random_bytes(&auth->client_challenge, sizeof(u64));
+		tmp.client_challenge = auth->client_challenge;
+		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+				     tmp_enc, sizeof(tmp_enc));
+		if (ret < 0)
+			return ret;
+
+		auth->struct_v = 1;
+		auth->key = 0;
+		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+			auth->key ^= *(__le64 *)u;
+		dout(" server_challenge %llx client_challenge %llx key %llx\n",
+		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
+		     le64_to_cpu(auth->key));
+
+		/* now encode the old ticket if exists */
+		ret = ceph_x_encode_ticket(th, &p, end);
+		if (ret < 0)
+			return ret;
+
+		return p - buf;
+	}
+
+	if (need) {
+		void *p = head + 1;
+		struct ceph_x_service_ticket_request *req;
+
+		if (p > end)
+			return -ERANGE;
+		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+		if (ret)
+			return ret;
+		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+				 xi->auth_authorizer.buf->vec.iov_len);
+
+		req = p;
+		req->keys = cpu_to_le32(need);
+		p += sizeof(*req);
+		return p - buf;
+	}
+
+	return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+			       void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct ceph_x_reply_header *head = buf;
+	struct ceph_x_ticket_handler *th;
+	int len = end - buf;
+	int op;
+	int ret;
+
+	if (result)
+		return result;  /* XXX hmm? */
+
+	if (xi->starting) {
+		/* it's a hello */
+		struct ceph_x_server_challenge *sc = buf;
+
+		if (len != sizeof(*sc))
+			return -EINVAL;
+		xi->server_challenge = le64_to_cpu(sc->server_challenge);
+		dout("handle_reply got server challenge %llx\n",
+		     xi->server_challenge);
+		xi->starting = false;
+		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+		return -EAGAIN;
+	}
+
+	op = le16_to_cpu(head->op);
+	result = le32_to_cpu(head->result);
+	dout("handle_reply op %d result %d\n", op, result);
+	switch (op) {
+	case CEPHX_GET_AUTH_SESSION_KEY:
+		/* verify auth key */
+		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+					       buf + sizeof(*head), end);
+		break;
+
+	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+		if (IS_ERR(th))
+			return PTR_ERR(th);
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+					       buf + sizeof(*head), end);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (ret)
+		return ret;
+	if (ac->want_keys == xi->have_keys)
+		return 0;
+	return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_x_authorizer *au;
+	struct ceph_x_ticket_handler *th;
+	int ret;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	au = kzalloc(sizeof(*au), GFP_NOFS);
+	if (!au)
+		return -ENOMEM;
+
+	ret = ceph_x_build_authorizer(ac, th, au);
+	if (ret) {
+		kfree(au);
+		return ret;
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf->vec.iov_base;
+	*len = au->buf->vec.iov_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+					  struct ceph_authorizer *a, size_t len)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+	struct ceph_x_ticket_handler *th;
+	int ret = 0;
+	struct ceph_x_authorize_reply reply;
+	void *p = au->reply_buf;
+	void *end = p + sizeof(au->reply_buf);
+
+	th = get_ticket_handler(ac, au->service);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+	if (ret < 0)
+		return ret;
+	if (ret != sizeof(reply))
+		return -EPERM;
+
+	if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+		ret = -EPERM;
+	else
+		ret = 0;
+	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+	return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_buffer_put(au->buf);
+	kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("reset\n");
+	xi->starting = true;
+	xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *p;
+
+	dout("ceph_x_destroy %p\n", ac);
+	ceph_crypto_key_destroy(&xi->secret);
+
+	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+		struct ceph_x_ticket_handler *th =
+			rb_entry(p, struct ceph_x_ticket_handler, node);
+		remove_ticket_handler(ac, th);
+	}
+
+	if (xi->auth_authorizer.buf)
+		ceph_buffer_put(xi->auth_authorizer.buf);
+
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+				   int peer_type)
+{
+	struct ceph_x_ticket_handler *th;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (!IS_ERR(th))
+		remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+	.name = "x",
+	.is_authenticated = ceph_x_is_authenticated,
+	.should_authenticate = ceph_x_should_authenticate,
+	.build_request = ceph_x_build_request,
+	.handle_reply = ceph_x_handle_reply,
+	.create_authorizer = ceph_x_create_authorizer,
+	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+	.destroy_authorizer = ceph_x_destroy_authorizer,
+	.invalidate_authorizer = ceph_x_invalidate_authorizer,
+	.reset =  ceph_x_reset,
+	.destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi;
+	int ret;
+
+	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		goto out;
+
+	ret = -EINVAL;
+	if (!ac->secret) {
+		pr_err("no secret set (for auth_x protocol)\n");
+		goto out_nomem;
+	}
+
+	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+	if (ret)
+		goto out_nomem;
+
+	xi->starting = true;
+	xi->ticket_handlers = RB_ROOT;
+
+	ac->protocol = CEPH_AUTH_CEPHX;
+	ac->private = xi;
+	ac->ops = &ceph_x_ops;
+	return 0;
+
+out_nomem:
+	kfree(xi);
+out:
+	return ret;
+}
+
+
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
new file mode 100644
index 00000000000..e02da7a5c5a
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+	struct rb_node node;
+	unsigned service;
+
+	struct ceph_crypto_key session_key;
+	struct ceph_timespec validity;
+
+	u64 secret_id;
+	struct ceph_buffer *ticket_blob;
+
+	unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+	struct ceph_buffer *buf;
+	unsigned service;
+	u64 nonce;
+	char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+	struct ceph_crypto_key secret;
+
+	bool starting;
+	u64 server_challenge;
+
+	unsigned have_keys;
+	struct rb_root ticket_handlers;
+
+	struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
new file mode 100644
index 00000000000..671d30576c4
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+	__u8 struct_v;
+	__le64 secret_id;
+	__le32 blob_len;
+	char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+	__le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+	__le16 op;
+	__le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+	__u8 struct_v;
+	__le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+	__u8 struct_v;
+	__le64 client_challenge;
+	__le64 key;
+	/* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+	__u8 struct_v;
+	__le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+	__le64 server_challenge;
+	__le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+	__u8 struct_v;
+	__le64 global_id;
+	__le32 service_id;
+	struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+	__u8 struct_v;
+	__le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+	__u8 struct_v;
+	__le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+	__u8 struct_v;
+	__le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
new file mode 100644
index 00000000000..53d8abfa25d
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+	struct ceph_buffer *b;
+
+	b = kmalloc(sizeof(*b), gfp);
+	if (!b)
+		return NULL;
+
+	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+	if (b->vec.iov_base) {
+		b->is_vmalloc = false;
+	} else {
+		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		if (!b->vec.iov_base) {
+			kfree(b);
+			return NULL;
+		}
+		b->is_vmalloc = true;
+	}
+
+	kref_init(&b->kref);
+	b->alloc_len = len;
+	b->vec.iov_len = len;
+	dout("buffer_new %p\n", b);
+	return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+void ceph_buffer_release(struct kref *kref)
+{
+	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+	dout("buffer_release %p\n", b);
+	if (b->vec.iov_base) {
+		if (b->is_vmalloc)
+			vfree(b->vec.iov_base);
+		else
+			kfree(b->vec.iov_base);
+	}
+	kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+	size_t len;
+
+	ceph_decode_need(p, end, sizeof(u32), bad);
+	len = ceph_decode_32(p);
+	dout("decode_buffer len %d\n", (int)len);
+	ceph_decode_need(p, end, len, bad);
+	*b = ceph_buffer_new(len, GFP_NOFS);
+	if (!*b)
+		return -ENOMEM;
+	ceph_decode_copy(p, (*b)->vec.iov_base, len);
+	return 0;
+bad:
+	return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 00000000000..f3e4a13fea0
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *e = s + len;
+
+	while (e != s && *(e-1) != '/')
+		e--;
+	return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	default: return "unknown";
+	}
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			pr_err("bad fsid, had %pU got %pU",
+			       &client->fsid, fsid);
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+	if (!s1 && !s2)
+		return 0;
+	if (s1 && !s2)
+		return -1;
+	if (!s1 && s2)
+		return 1;
+	return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+			 struct ceph_client *client)
+{
+	struct ceph_options *opt1 = new_opt;
+	struct ceph_options *opt2 = client->options;
+	int ofs = offsetof(struct ceph_options, mon_addr);
+	int i;
+	int ret;
+
+	ret = memcmp(opt1, opt2, ofs);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->name, opt2->name);
+	if (ret)
+		return ret;
+
+	ret = strcmp_null(opt1->secret, opt2->secret);
+	if (ret)
+		return ret;
+
+	/* any matching mon ip implies a match */
+	for (i = 0; i < opt1->num_mon; i++) {
+		if (ceph_monmap_contains(client->monc.monmap,
+				 &opt1->mon_addr[i]))
+			return 0;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+	int i = 0;
+	char tmp[3];
+	int err = -EINVAL;
+	int d;
+
+	dout("parse_fsid '%s'\n", str);
+	tmp[2] = 0;
+	while (*str && i < 16) {
+		if (ispunct(*str)) {
+			str++;
+			continue;
+		}
+		if (!isxdigit(str[0]) || !isxdigit(str[1]))
+			break;
+		tmp[0] = str[0];
+		tmp[1] = str[1];
+		if (sscanf(tmp, "%x", &d) < 1)
+			break;
+		fsid->fsid[i] = d & 0xff;
+		i++;
+		str += 2;
+	}
+
+	if (i == 16)
+		err = 0;
+	dout("parse_fsid ret %d got fsid %pU", err, fsid);
+	return err;
+}
+
+/*
+ * ceph options
+ */
+enum {
+	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
+	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
+	Opt_last_int,
+	/* int args above */
+	Opt_fsid,
+	Opt_name,
+	Opt_secret,
+	Opt_ip,
+	Opt_last_string,
+	/* string args above */
+	Opt_noshare,
+	Opt_nocrc,
+};
+
+static match_table_t opt_tokens = {
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	/* int args above */
+	{Opt_fsid, "fsid=%s"},
+	{Opt_name, "name=%s"},
+	{Opt_secret, "secret=%s"},
+	{Opt_ip, "ip=%s"},
+	/* string args above */
+	{Opt_noshare, "noshare"},
+	{Opt_nocrc, "nocrc"},
+	{-1, NULL}
+};
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+	dout("destroy_options %p\n", opt);
+	kfree(opt->name);
+	kfree(opt->secret);
+	kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+int ceph_parse_options(struct ceph_options **popt, char *options,
+		       const char *dev_name, const char *dev_name_end,
+		       int (*parse_extra_token)(char *c, void *private),
+		       void *private)
+{
+	struct ceph_options *opt;
+	const char *c;
+	int err = -ENOMEM;
+	substring_t argstr[MAX_OPT_ARGS];
+
+	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+	if (!opt)
+		return err;
+	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+				GFP_KERNEL);
+	if (!opt->mon_addr)
+		goto out;
+
+	dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+	     dev_name);
+
+	/* start with defaults */
+	opt->flags = CEPH_OPT_DEFAULT;
+	opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+	/* get mon ip(s) */
+	/* ip1[:port1][,ip2[:port2]...] */
+	err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+			     CEPH_MAX_MON, &opt->num_mon);
+	if (err < 0)
+		goto out;
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		err = -EINVAL;
+		token = match_token((char *)c, opt_tokens, argstr);
+		if (token < 0 && parse_extra_token) {
+			/* extra? */
+			err = parse_extra_token((char *)c, private);
+			if (err < 0) {
+				pr_err("bad option at '%s'\n", c);
+				goto out;
+			}
+			continue;
+		}
+		if (token < Opt_last_int) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
+		}
+		switch (token) {
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &opt->my_addr,
+					     1, NULL);
+			if (err < 0)
+				goto out;
+			opt->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_fsid:
+			err = parse_fsid(argstr[0].from, &opt->fsid);
+			if (err == 0)
+				opt->flags |= CEPH_OPT_FSID;
+			break;
+		case Opt_name:
+			opt->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
+		case Opt_secret:
+			opt->secret = kstrndup(argstr[0].from,
+						argstr[0].to-argstr[0].from,
+						GFP_KERNEL);
+			break;
+
+			/* misc */
+		case Opt_osdtimeout:
+			opt->osd_timeout = intval;
+			break;
+		case Opt_osdkeepalivetimeout:
+			opt->osd_keepalive_timeout = intval;
+			break;
+		case Opt_osd_idle_ttl:
+			opt->osd_idle_ttl = intval;
+			break;
+		case Opt_mount_timeout:
+			opt->mount_timeout = intval;
+			break;
+
+		case Opt_noshare:
+			opt->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_nocrc:
+			opt->flags |= CEPH_OPT_NOCRC;
+			break;
+
+		default:
+			BUG_ON(token);
+		}
+	}
+
+	/* success */
+	*popt = opt;
+	return 0;
+
+out:
+	ceph_destroy_options(opt);
+	return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+u64 ceph_client_id(struct ceph_client *client)
+{
+	return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+	struct ceph_client *client;
+	int err = -ENOMEM;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	client->private = private;
+	client->options = opt;
+
+	mutex_init(&client->mount_mutex);
+	init_waitqueue_head(&client->auth_wq);
+	client->auth_err = 0;
+
+	client->extra_mon_dispatch = NULL;
+	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+	client->msgr = NULL;
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto fail;
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0)
+		goto fail_monc;
+
+	return client;
+
+fail_monc:
+	ceph_monc_stop(&client->monc);
+fail:
+	kfree(client);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_osdc_stop(&client->osdc);
+
+	/*
+	 * make sure mds and osd connections close out before destroying
+	 * the auth module, which is needed to free those connections'
+	 * ceph_authorizers.
+	 */
+	ceph_msgr_flush();
+
+	ceph_monc_stop(&client->monc);
+
+	ceph_debugfs_client_cleanup(client);
+
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+
+	ceph_destroy_options(client->options);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+	return client->monc.monmap && client->monc.monmap->epoch &&
+	       client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->options->mount_timeout * HZ;
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->options->my_addr;
+		client->msgr = ceph_messenger_create(myaddr,
+					client->supported_features,
+					client->required_features);
+		if (IS_ERR(client->msgr)) {
+			client->msgr = NULL;
+			return PTR_ERR(client->msgr);
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* open session, and wait for mon and osd maps */
+	err = ceph_monc_open_session(&client->monc);
+	if (err < 0)
+		return err;
+
+	while (!have_mon_and_osd_map(client)) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			return err;
+
+		/* wait */
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->auth_wq,
+			have_mon_and_osd_map(client) || (client->auth_err < 0),
+			timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			return err;
+		if (client->auth_err < 0)
+			return client->auth_err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+int ceph_open_session(struct ceph_client *client)
+{
+	int ret;
+	unsigned long started = jiffies;  /* note the start time */
+
+	dout("open_session start\n");
+	mutex_lock(&client->mount_mutex);
+
+	ret = __ceph_open_session(client, started);
+
+	mutex_unlock(&client->mount_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+static int __init init_ceph_lib(void)
+{
+	int ret = 0;
+
+	ret = ceph_debugfs_init();
+	if (ret < 0)
+		goto out;
+
+	ret = ceph_msgr_init();
+	if (ret < 0)
+		goto out_debugfs;
+
+	pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+	return 0;
+
+out_debugfs:
+	ceph_debugfs_cleanup();
+out:
+	return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+	dout("exit_ceph_lib\n");
+	ceph_msgr_exit();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
new file mode 100644
index 00000000000..a3a3a31d3c3
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	__u32 os = le32_to_cpu(layout->fl_object_size);
+
+	/* stripe unit, object size must be non-zero, 64k increment */
+	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	/* object size must be a multiple of stripe unit */
+	if (os < su || os % su)
+		return 0;
+	/* stripe count must be non-zero */
+	if (!sc)
+		return 0;
+	return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+	int mode;
+
+#ifdef O_DIRECTORY  /* fixme */
+	if ((flags & O_DIRECTORY) == O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;
+#endif
+	if ((flags & O_APPEND) == O_APPEND)
+		flags |= O_WRONLY;
+
+	if ((flags & O_ACCMODE) == O_RDWR)
+		mode = CEPH_FILE_MODE_RDWR;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		mode = CEPH_FILE_MODE_WR;
+	else
+		mode = CEPH_FILE_MODE_RD;
+
+#ifdef O_LAZY
+	if (flags & O_LAZY)
+		mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+	return mode;
+}
+EXPORT_SYMBOL(ceph_flags_to_mode);
+
+int ceph_caps_for_mode(int mode)
+{
+	int caps = CEPH_CAP_PIN;
+
+	if (mode & CEPH_FILE_MODE_RD)
+		caps |= CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	if (mode & CEPH_FILE_MODE_WR)
+		caps |= CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	if (mode & CEPH_FILE_MODE_LAZY)
+		caps |= CEPH_CAP_FILE_LAZYIO;
+
+	return caps;
+}
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
new file mode 100644
index 00000000000..815ef882679
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include <linux/ceph/types.h>
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last 11 bytes */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);
+
+	return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+	unsigned long hash = 0;
+	unsigned char c;
+
+	while (length--) {
+		c = *str++;
+		hash = (hash + (c << 4) + (c >> 4)) * 11;
+	}
+	return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return ceph_str_hash_linux(s, len);
+	case CEPH_STR_HASH_RJENKINS:
+		return ceph_str_hash_rjenkins(s, len);
+	default:
+		return -1;
+	}
+}
+
+const char *ceph_str_hash_name(int type)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return "linux";
+	case CEPH_STR_HASH_RJENKINS:
+		return "rjenkins";
+	default:
+		return "unknown";
+	}
+}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 00000000000..3fbda04de29
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ: return "read";
+	case CEPH_OSD_OP_STAT: return "stat";
+
+	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+	case CEPH_OSD_OP_WRITE: return "write";
+	case CEPH_OSD_OP_DELETE: return "delete";
+	case CEPH_OSD_OP_TRUNCATE: return "truncate";
+	case CEPH_OSD_OP_ZERO: return "zero";
+	case CEPH_OSD_OP_WRITEFULL: return "writefull";
+	case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+	case CEPH_OSD_OP_APPEND: return "append";
+	case CEPH_OSD_OP_STARTSYNC: return "startsync";
+	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+	case CEPH_OSD_OP_TMAPUP: return "tmapup";
+	case CEPH_OSD_OP_TMAPGET: return "tmapget";
+	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+	case CEPH_OSD_OP_GETXATTR: return "getxattr";
+	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+	case CEPH_OSD_OP_SETXATTR: return "setxattr";
+	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+	case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+	case CEPH_OSD_OP_PULL: return "pull";
+	case CEPH_OSD_OP_PUSH: return "push";
+	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+	case CEPH_OSD_OP_SCRUB: return "scrub";
+
+	case CEPH_OSD_OP_WRLOCK: return "wrlock";
+	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+	case CEPH_OSD_OP_RDLOCK: return "rdlock";
+	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+	case CEPH_OSD_OP_UPLOCK: return "uplock";
+	case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+	case CEPH_OSD_OP_CALL: return "call";
+
+	case CEPH_OSD_OP_PGLS: return "pgls";
+	}
+	return "???";
+}
+
+
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
new file mode 100644
index 00000000000..d6ebb13a18a
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include <linux/crush/crush.h>
+
+const char *crush_bucket_alg_name(int alg)
+{
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM: return "uniform";
+	case CRUSH_BUCKET_LIST: return "list";
+	case CRUSH_BUCKET_TREE: return "tree";
+	case CRUSH_BUCKET_STRAW: return "straw";
+	default: return "unknown";
+	}
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+	if (p >= b->size)
+		return 0;
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return ((struct crush_bucket_uniform *)b)->item_weight;
+	case CRUSH_BUCKET_LIST:
+		return ((struct crush_bucket_list *)b)->item_weights[p];
+	case CRUSH_BUCKET_TREE:
+		if (p & 1)
+			return ((struct crush_bucket_tree *)b)->node_weights[p];
+		return 0;
+	case CRUSH_BUCKET_STRAW:
+		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	}
+	return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+	int i, b, c;
+
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == NULL)
+			continue;
+		for (i = 0; i < map->buckets[b]->size; i++) {
+			c = map->buckets[b]->items[i];
+			BUG_ON(c >= map->max_devices ||
+			       c < -map->max_buckets);
+			if (c >= 0)
+				map->device_parents[c] = map->buckets[b]->id;
+			else
+				map->bucket_parents[-1-c] = map->buckets[b]->id;
+		}
+	}
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+	kfree(b->item_weights);
+	kfree(b->sum_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+	kfree(b->node_weights);
+	kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+	kfree(b->straws);
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+		break;
+	}
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+	int b;
+
+	/* buckets */
+	if (map->buckets) {
+		for (b = 0; b < map->max_buckets; b++) {
+			if (map->buckets[b] == NULL)
+				continue;
+			crush_destroy_bucket(map->buckets[b]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* rules */
+	if (map->rules) {
+		for (b = 0; b < map->max_rules; b++)
+			kfree(map->rules[b]);
+		kfree(map->rules);
+	}
+
+	kfree(map->bucket_parents);
+	kfree(map->device_parents);
+	kfree(map);
+}
+
+
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
new file mode 100644
index 00000000000..5bb63e37a8a
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+	__u32 hash = crush_hash_seed ^ a;
+	__u32 b = a;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, a, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(x, a, hash);
+	crush_hashmix(b, y, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(a, x, hash);
+	crush_hashmix(y, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, d, hash);
+	return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+				      __u32 e)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(e, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	crush_hashmix(d, x, hash);
+	crush_hashmix(y, e, hash);
+	return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1(a);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_2(a, b);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_3(a, b, c);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+	default:
+		return 0;
+	}
+}
+
+const char *crush_hash_name(int type)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return "rjenkins1";
+	default:
+		return "unknown";
+	}
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
new file mode 100644
index 00000000000..42599e31dca
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+	int i;
+
+	for (i = 0; i < map->max_rules; i++) {
+		if (map->rules[i] &&
+		    map->rules[i]->mask.ruleset == ruleset &&
+		    map->rules[i]->mask.type == type &&
+		    map->rules[i]->mask.min_size <= size &&
+		    map->rules[i]->mask.max_size >= size)
+			return i;
+	}
+	return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+			      int x, int r)
+{
+	unsigned pr = r % bucket->size;
+	unsigned i, s;
+
+	/* start a new permutation if @x has changed */
+	if (bucket->perm_x != x || bucket->perm_n == 0) {
+		dprintk("bucket %d new x=%d\n", bucket->id, x);
+		bucket->perm_x = x;
+
+		/* optimize common r=0 case */
+		if (pr == 0) {
+			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+				bucket->size;
+			bucket->perm[0] = s;
+			bucket->perm_n = 0xffff;   /* magic value, see below */
+			goto out;
+		}
+
+		for (i = 0; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm_n = 0;
+	} else if (bucket->perm_n == 0xffff) {
+		/* clean up after the r=0 case above */
+		for (i = 1; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm[bucket->perm[0]] = 0;
+		bucket->perm_n = 1;
+	}
+
+	/* calculate permutation up to pr */
+	for (i = 0; i < bucket->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+	while (bucket->perm_n <= pr) {
+		unsigned p = bucket->perm_n;
+		/* no point in swapping the final entry */
+		if (p < bucket->size - 1) {
+			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+				(bucket->size - p);
+			if (i) {
+				unsigned t = bucket->perm[p + i];
+				bucket->perm[p + i] = bucket->perm[p];
+				bucket->perm[p] = t;
+			}
+			dprintk(" perm_choose swap %d with %d\n", p, p+i);
+		}
+		bucket->perm_n++;
+	}
+	for (i = 0; i < bucket->size; i++)
+		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+
+	s = bucket->perm[pr];
+out:
+	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+		bucket->size, x, r, pr, s);
+	return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+				 int x, int r)
+{
+	return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	int i;
+
+	for (i = bucket->h.size-1; i >= 0; i--) {
+		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+					 r, bucket->h.id);
+		w &= 0xffff;
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			i, x, r, bucket->h.items[i], bucket->item_weights[i],
+			bucket->sum_weights[i], w);
+		w *= bucket->sum_weights[i];
+		w = w >> 16;
+		/*dprintk(" scaled %llx\n", w);*/
+		if (w < bucket->item_weights[i])
+			return bucket->h.items[i];
+	}
+
+	BUG_ON(1);
+	return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+	int h = 0;
+	while ((n & 1) == 0) {
+		h++;
+		n = n >> 1;
+	}
+	return h;
+}
+
+static int left(int x)
+{
+	int h = height(x);
+	return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+	int h = height(x);
+	return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+	return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	int n, l;
+	__u32 w;
+	__u64 t;
+
+	/* start at root */
+	n = bucket->num_nodes >> 1;
+
+	while (!terminal(n)) {
+		/* pick point in [0, w) */
+		w = bucket->node_weights[n];
+		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+					  bucket->h.id) * (__u64)w;
+		t = t >> 32;
+
+		/* descend to the left or right? */
+		l = left(n);
+		if (t < bucket->node_weights[l])
+			n = l;
+		else
+			n = right(n);
+	}
+
+	return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	int i;
+	int high = 0;
+	__u64 high_draw = 0;
+	__u64 draw;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+		draw &= 0xffff;
+		draw *= bucket->straws[i];
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+	switch (in->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+					  x, r);
+	case CRUSH_BUCKET_LIST:
+		return bucket_list_choose((struct crush_bucket_list *)in,
+					  x, r);
+	case CRUSH_BUCKET_TREE:
+		return bucket_tree_choose((struct crush_bucket_tree *)in,
+					  x, r);
+	case CRUSH_BUCKET_STRAW:
+		return bucket_straw_choose((struct crush_bucket_straw *)in,
+					   x, r);
+	default:
+		BUG_ON(1);
+		return in->items[0];
+	}
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+	if (weight[item] >= 0x10000)
+		return 0;
+	if (weight[item] == 0)
+		return 1;
+	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+	    < weight[item])
+		return 0;
+	return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+			struct crush_bucket *bucket,
+			__u32 *weight,
+			int x, int numrep, int type,
+			int *out, int outpos,
+			int firstn, int recurse_to_leaf,
+			int *out2)
+{
+	int rep;
+	int ftotal, flocal;
+	int retry_descent, retry_bucket, skip_rep;
+	struct crush_bucket *in = bucket;
+	int r;
+	int i;
+	int item = 0;
+	int itemtype;
+	int collide, reject;
+	const int orig_tries = 5; /* attempts before we fall back to search */
+
+	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep);
+
+	for (rep = outpos; rep < numrep; rep++) {
+		/* keep trying until we get a non-out, non-colliding item */
+		ftotal = 0;
+		skip_rep = 0;
+		do {
+			retry_descent = 0;
+			in = bucket;               /* initial bucket */
+
+			/* choose through intervening buckets */
+			flocal = 0;
+			do {
+				collide = 0;
+				retry_bucket = 0;
+				r = rep;
+				if (in->alg == CRUSH_BUCKET_UNIFORM) {
+					/* be careful */
+					if (firstn || numrep >= in->size)
+						/* r' = r + f_total */
+						r += ftotal;
+					else if (in->size % numrep == 0)
+						/* r'=r+(n+1)*f_local */
+						r += (numrep+1) *
+							(flocal+ftotal);
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				} else {
+					if (firstn)
+						/* r' = r + f_total */
+						r += ftotal;
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				}
+
+				/* bucket choose */
+				if (in->size == 0) {
+					reject = 1;
+					goto reject;
+				}
+				if (flocal >= (in->size>>1) &&
+				    flocal > orig_tries)
+					item = bucket_perm_choose(in, x, r);
+				else
+					item = crush_bucket_choose(in, x, r);
+				BUG_ON(item >= map->max_devices);
+
+				/* desired type? */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					BUG_ON(item >= 0 ||
+					       (-1-item) >= map->max_buckets);
+					in = map->buckets[-1-item];
+					retry_bucket = 1;
+					continue;
+				}
+
+				/* collision? */
+				for (i = 0; i < outpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+
+				reject = 0;
+				if (recurse_to_leaf) {
+					if (item < 0) {
+						if (crush_choose(map,
+							 map->buckets[-1-item],
+							 weight,
+							 x, outpos+1, 0,
+							 out2, outpos,
+							 firstn, 0,
+							 NULL) <= outpos)
+							/* didn't get leaf */
+							reject = 1;
+					} else {
+						/* we already have a leaf! */
+						out2[outpos] = item;
+					}
+				}
+
+				if (!reject) {
+					/* out? */
+					if (itemtype == 0)
+						reject = is_out(map, weight,
+								item, x);
+					else
+						reject = 0;
+				}
+
+reject:
+				if (reject || collide) {
+					ftotal++;
+					flocal++;
+
+					if (collide && flocal < 3)
+						/* retry locally a few times */
+						retry_bucket = 1;
+					else if (flocal < in->size + orig_tries)
+						/* exhaustive bucket search */
+						retry_bucket = 1;
+					else if (ftotal < 20)
+						/* then retry descent */
+						retry_descent = 1;
+					else
+						/* else give up */
+						skip_rep = 1;
+					dprintk("  reject %d  collide %d  "
+						"ftotal %d  flocal %d\n",
+						reject, collide, ftotal,
+						flocal);
+				}
+			} while (retry_bucket);
+		} while (retry_descent);
+
+		if (skip_rep) {
+			dprintk("skip rep\n");
+			continue;
+		}
+
+		dprintk("CHOOSE got %d\n", item);
+		out[outpos] = item;
+		outpos++;
+	}
+
+	dprintk("CHOOSE returns %d\n", outpos);
+	return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  int force, __u32 *weight)
+{
+	int result_len;
+	int force_context[CRUSH_MAX_DEPTH];
+	int force_pos = -1;
+	int a[CRUSH_MAX_SET];
+	int b[CRUSH_MAX_SET];
+	int c[CRUSH_MAX_SET];
+	int recurse_to_leaf;
+	int *w;
+	int wsize = 0;
+	int *o;
+	int osize;
+	int *tmp;
+	struct crush_rule *rule;
+	int step;
+	int i, j;
+	int numrep;
+	int firstn;
+	int rc = -1;
+
+	BUG_ON(ruleno >= map->max_rules);
+
+	rule = map->rules[ruleno];
+	result_len = 0;
+	w = a;
+	o = b;
+
+	/*
+	 * determine hierarchical context of force, if any.  note
+	 * that this may or may not correspond to the specific types
+	 * referenced by the crush rule.
+	 */
+	if (force >= 0) {
+		if (force >= map->max_devices ||
+		    map->device_parents[force] == 0) {
+			/*dprintk("CRUSH: forcefed device dne\n");*/
+			rc = -1;  /* force fed device dne */
+			goto out;
+		}
+		if (!is_out(map, weight, force, x)) {
+			while (1) {
+				force_context[++force_pos] = force;
+				if (force >= 0)
+					force = map->device_parents[force];
+				else
+					force = map->bucket_parents[-1-force];
+				if (force == 0)
+					break;
+			}
+		}
+	}
+
+	for (step = 0; step < rule->len; step++) {
+		firstn = 0;
+		switch (rule->steps[step].op) {
+		case CRUSH_RULE_TAKE:
+			w[0] = rule->steps[step].arg1;
+			if (force_pos >= 0) {
+				BUG_ON(force_context[force_pos] != w[0]);
+				force_pos--;
+			}
+			wsize = 1;
+			break;
+
+		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+		case CRUSH_RULE_CHOOSE_FIRSTN:
+			firstn = 1;
+		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+		case CRUSH_RULE_CHOOSE_INDEP:
+			BUG_ON(wsize == 0);
+
+			recurse_to_leaf =
+				rule->steps[step].op ==
+				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+				rule->steps[step].op ==
+				CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+			/* reset output */
+			osize = 0;
+
+			for (i = 0; i < wsize; i++) {
+				/*
+				 * see CRUSH_N, CRUSH_N_MINUS macros.
+				 * basically, numrep <= 0 means relative to
+				 * the provided result_max
+				 */
+				numrep = rule->steps[step].arg1;
+				if (numrep <= 0) {
+					numrep += result_max;
+					if (numrep <= 0)
+						continue;
+				}
+				j = 0;
+				if (osize == 0 && force_pos >= 0) {
+					/* skip any intermediate types */
+					while (force_pos &&
+					       force_context[force_pos] < 0 &&
+					       rule->steps[step].arg2 !=
+					       map->buckets[-1 -
+					       force_context[force_pos]]->type)
+						force_pos--;
+					o[osize] = force_context[force_pos];
+					if (recurse_to_leaf)
+						c[osize] = force_context[0];
+					j++;
+					force_pos--;
+				}
+				osize += crush_choose(map,
+						      map->buckets[-1-w[i]],
+						      weight,
+						      x, numrep,
+						      rule->steps[step].arg2,
+						      o+osize, j,
+						      firstn,
+						      recurse_to_leaf, c+osize);
+			}
+
+			if (recurse_to_leaf)
+				/* copy final _leaf_ values to output set */
+				memcpy(o, c, osize*sizeof(*o));
+
+			/* swap t and w arrays */
+			tmp = o;
+			o = w;
+			w = tmp;
+			wsize = osize;
+			break;
+
+
+		case CRUSH_RULE_EMIT:
+			for (i = 0; i < wsize && result_len < result_max; i++) {
+				result[result_len] = w[i];
+				result_len++;
+			}
+			wsize = 0;
+			break;
+
+		default:
+			BUG_ON(1);
+		}
+	}
+	rc = result_len;
+
+out:
+	return rc;
+}
+
+
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
new file mode 100644
index 00000000000..7b505b0c983
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	if (*p + sizeof(u16) + sizeof(key->created) +
+	    sizeof(u16) + key->len > end)
+		return -ERANGE;
+	ceph_encode_16(p, key->type);
+	ceph_encode_copy(p, &key->created, sizeof(key->created));
+	ceph_encode_16(p, key->len);
+	ceph_encode_copy(p, key->key, key->len);
+	return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+	key->type = ceph_decode_16(p);
+	ceph_decode_copy(p, &key->created, sizeof(key->created));
+	key->len = ceph_decode_16(p);
+	ceph_decode_need(p, end, key->len, bad);
+	key->key = kmalloc(key->len, GFP_NOFS);
+	if (!key->key)
+		return -ENOMEM;
+	ceph_decode_copy(p, key->key, key->len);
+	return 0;
+
+bad:
+	dout("failed to decode crypto key\n");
+	return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+	int inlen = strlen(inkey);
+	int blen = inlen * 3 / 4;
+	void *buf, *p;
+	int ret;
+
+	dout("crypto_key_unarmor %s\n", inkey);
+	buf = kmalloc(blen, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+	blen = ceph_unarmor(buf, inkey, inkey+inlen);
+	if (blen < 0) {
+		kfree(buf);
+		return blen;
+	}
+
+	p = buf;
+	ret = ceph_crypto_key_decode(key, &p, p + blen);
+	kfree(buf);
+	if (ret)
+		return ret;
+	dout("crypto_key_unarmor key %p type %d len %d\n", key,
+	     key->type, key->len);
+	return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+static int ceph_aes_encrypt(const void *key, int key_len,
+			    void *dst, size_t *dst_len,
+			    const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[2], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - (src_len & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 2);
+	sg_set_buf(&sg_in[0], src, src_len);
+	sg_set_buf(&sg_in[1], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+			src, src_len, 1);
+	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
+			     size_t *dst_len,
+			     const void *src1, size_t src1_len,
+			     const void *src2, size_t src2_len)
+{
+	struct scatterlist sg_in[3], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src1_len + src2_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 3);
+	sg_set_buf(&sg_in[0], src1, src1_len);
+	sg_set_buf(&sg_in[1], src2, src2_len);
+	sg_set_buf(&sg_in[2], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+			src1, src1_len, 1);
+	print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+			src2, src2_len, 1);
+	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src1_len + src2_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt2 failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+static int ceph_aes_decrypt(const void *key, int key_len,
+			    void *dst, size_t *dst_len,
+			    const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[2];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 1);
+	sg_init_table(sg_out, 2);
+	sg_set_buf(sg_in, src, src_len);
+	sg_set_buf(&sg_out[0], dst, *dst_len);
+	sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst_len)
+		last_byte = ((char *)dst)[src_len - 1];
+	else
+		last_byte = pad[src_len - *dst_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		*dst_len = src_len - last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+static int ceph_aes_decrypt2(const void *key, int key_len,
+			     void *dst1, size_t *dst1_len,
+			     void *dst2, size_t *dst2_len,
+			     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[3];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	sg_init_table(sg_in, 1);
+	sg_set_buf(sg_in, src, src_len);
+	sg_init_table(sg_out, 3);
+	sg_set_buf(&sg_out[0], dst1, *dst1_len);
+	sg_set_buf(&sg_out[1], dst2, *dst2_len);
+	sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst1_len)
+		last_byte = ((char *)dst1)[src_len - 1];
+	else if (src_len <= *dst1_len + *dst2_len)
+		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+	else
+		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		src_len -= last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+
+	if (src_len < *dst1_len) {
+		*dst1_len = src_len;
+		*dst2_len = 0;
+	} else {
+		*dst2_len = src_len - *dst1_len;
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst1, *dst1_len, 1);
+	print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst2, *dst2_len, 1);
+	*/
+
+	return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len)
+{
+	size_t t;
+
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst1_len + *dst2_len < src_len)
+			return -ERANGE;
+		t = min(*dst1_len, src_len);
+		memcpy(dst1, src, t);
+		*dst1_len = t;
+		src += t;
+		src_len -= t;
+		if (src_len) {
+			t = min(*dst2_len, src_len);
+			memcpy(dst2, src, t);
+			*dst2_len = t;
+		}
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt2(secret->key, secret->len,
+					 dst1, dst1_len, dst2, dst2_len,
+					 src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		  const void *src1, size_t src1_len,
+		  const void *src2, size_t src2_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src1_len + src2_len)
+			return -ERANGE;
+		memcpy(dst, src1, src1_len);
+		memcpy(dst + src1_len, src2, src2_len);
+		*dst_len = src1_len + src2_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+					 src1, src1_len, src2, src2_len);
+
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
new file mode 100644
index 00000000000..f9eccace592
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+	int type;
+	struct ceph_timespec created;
+	int len;
+	void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+	kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+			 void *dst, size_t *dst_len,
+			 const void *src1, size_t src1_len,
+			 const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const char *src, const char *end);
+extern int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 00000000000..27d4ea315d1
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->monc.monmap == NULL)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+	for (i = 0; i < client->monc.monmap->num_mon; i++) {
+		struct ceph_entity_inst *inst =
+			&client->monc.monmap->mon_inst[i];
+
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   ceph_pr_addr(&inst->addr.in_addr));
+	}
+	return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+	struct rb_node *n;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+		struct ceph_pg_pool_info *pool =
+			rb_entry(n, struct ceph_pg_pool_info, node);
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];
+
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, ceph_pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+		__u16 op;
+		req = rb_entry(rp, struct ceph_mon_generic_request, node);
+		op = le16_to_cpu(req->request->hdr.type);
+		if (op == CEPH_MSG_STATFS)
+			seq_printf(s, "%lld statfs\n", req->tid);
+		else
+			seq_printf(s, "%lld unknown\n", req->tid);
+	}
+
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
+
+		head = req->r_request->front.iov_base;
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+int ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	if (!ceph_debugfs_dir)
+		return -ENOMEM;
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = -ENOMEM;
+	char name[80];
+
+	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+		 client->monc.auth->global_id);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	return 0;
+
+out:
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_osdmap);
+	debugfs_remove(client->debugfs_monmap);
+	debugfs_remove(client->osdc.debugfs_file);
+	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_dir);
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
new file mode 100644
index 00000000000..0e8157ee5d4
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system.  The messenger provides ordered and reliable
+ * delivery.  We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error).  Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+{
+	int i;
+	char *s;
+	struct sockaddr_in *in4 = (void *)ss;
+	struct sockaddr_in6 *in6 = (void *)ss;
+
+	spin_lock(&addr_str_lock);
+	i = last_addr_str++;
+	if (last_addr_str == MAX_ADDR_STR)
+		last_addr_str = 0;
+	spin_unlock(&addr_str_lock);
+	s = addr_str[i];
+
+	switch (ss->ss_family) {
+	case AF_INET:
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+			 (unsigned int)ntohs(in4->sin_port));
+		break;
+
+	case AF_INET6:
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+			 (unsigned int)ntohs(in6->sin6_port));
+		break;
+
+	default:
+		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+	}
+
+	return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+	ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int ceph_msgr_init(void)
+{
+	ceph_msgr_wq = create_workqueue("ceph-msgr");
+	if (IS_ERR(ceph_msgr_wq)) {
+		int ret = PTR_ERR(ceph_msgr_wq);
+		pr_err("msgr_init failed to create workqueue: %d\n", ret);
+		ceph_msgr_wq = NULL;
+		return ret;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_msgr_init);
+
+void ceph_msgr_exit(void)
+{
+	destroy_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_exit);
+
+void ceph_msgr_flush(void)
+{
+	flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+	if (sk->sk_state != TCP_CLOSE_WAIT) {
+		dout("ceph_data_ready on %p state = %lu, queueing work\n",
+		     con, con->state);
+		queue_con(con);
+	}
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	/* only queue to workqueue if there is data we want to write. */
+	if (test_bit(WRITE_PENDING, &con->state)) {
+		dout("ceph_write_space %p queueing write work\n", con);
+		queue_con(con);
+	} else {
+		dout("ceph_write_space %p nothing to write\n", con);
+	}
+
+	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
+	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	     con, con->state, sk->sk_state);
+
+	if (test_bit(CLOSED, &con->state))
+		return;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		dout("ceph_state_change TCP_CLOSE\n");
+	case TCP_CLOSE_WAIT:
+		dout("ceph_state_change TCP_CLOSE_WAIT\n");
+		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+			if (test_bit(CONNECTING, &con->state))
+				con->error_msg = "connection failed";
+			else
+				con->error_msg = "socket closed";
+			queue_con(con);
+		}
+		break;
+	case TCP_ESTABLISHED:
+		dout("ceph_state_change TCP_ESTABLISHED\n");
+		queue_con(con);
+		break;
+	}
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+			       struct ceph_connection *con)
+{
+	struct sock *sk = sock->sk;
+	sk->sk_user_data = (void *)con;
+	sk->sk_data_ready = ceph_data_ready;
+	sk->sk_write_space = ceph_write_space;
+	sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+	struct socket *sock;
+	int ret;
+
+	BUG_ON(con->sock);
+	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+			       IPPROTO_TCP, &sock);
+	if (ret)
+		return ERR_PTR(ret);
+	con->sock = sock;
+	sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+	set_sock_callbacks(sock, con);
+
+	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+
+	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+				 O_NONBLOCK);
+	if (ret == -EINPROGRESS) {
+		dout("connect %s EINPROGRESS sk_state = %u\n",
+		     ceph_pr_addr(&con->peer_addr.in_addr),
+		     sock->sk->sk_state);
+		ret = 0;
+	}
+	if (ret < 0) {
+		pr_err("connect %s error %d\n",
+		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
+		sock_release(sock);
+		con->sock = NULL;
+		con->error_msg = "connect error";
+	}
+
+	if (ret < 0)
+		return ERR_PTR(ret);
+	return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+	struct kvec iov = {buf, len};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something.  @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+		     size_t kvlen, size_t len, int more)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	if (more)
+		msg.msg_flags |= MSG_MORE;
+	else
+		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
+
+	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+	int rc;
+
+	dout("con_close_socket on %p sock %p\n", con, con->sock);
+	if (!con->sock)
+		return 0;
+	set_bit(SOCK_CLOSED, &con->state);
+	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+	sock_release(con->sock);
+	con->sock = NULL;
+	clear_bit(SOCK_CLOSED, &con->state);
+	return rc;
+}
+
+/*
+ * Reset a connection.  Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+	list_del_init(&msg->list_head);
+	ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+							list_head);
+		ceph_msg_remove(msg);
+	}
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+	/* reset connection, out_queue, msg_ and connect_seq */
+	/* discard existing out_queue and msg_seq */
+	ceph_msg_remove_list(&con->out_queue);
+	ceph_msg_remove_list(&con->out_sent);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	con->connect_seq = 0;
+	con->out_seq = 0;
+	if (con->out_msg) {
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
+	con->out_keepalive_pending = false;
+	con->in_seq = 0;
+	con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down.  drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+	dout("con_close %p peer %s\n", con,
+	     ceph_pr_addr(&con->peer_addr.in_addr));
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
+	clear_bit(KEEPALIVE_PENDING, &con->state);
+	clear_bit(WRITE_PENDING, &con->state);
+	mutex_lock(&con->mutex);
+	reset_connection(con);
+	con->peer_global_seq = 0;
+	cancel_delayed_work(&con->work);
+	mutex_unlock(&con->mutex);
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+	set_bit(OPENING, &con->state);
+	clear_bit(CLOSED, &con->state);
+	memcpy(&con->peer_addr, addr, sizeof(*addr));
+	con->delay = 0;      /* reset backoff memory */
+	queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+	dout("con_get %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+	if (atomic_inc_not_zero(&con->nref))
+		return con;
+	return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+	dout("con_put %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+	BUG_ON(atomic_read(&con->nref) == 0);
+	if (atomic_dec_and_test(&con->nref)) {
+		BUG_ON(con->sock);
+		kfree(con);
+	}
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+	dout("con_init %p\n", con);
+	memset(con, 0, sizeof(*con));
+	atomic_set(&con->nref, 1);
+	con->msgr = msgr;
+	mutex_init(&con->mutex);
+	INIT_LIST_HEAD(&con->out_queue);
+	INIT_LIST_HEAD(&con->out_sent);
+	INIT_DELAYED_WORK(&con->work, con_work);
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+
+/*
+ * We maintain a global counter to order connection attempts.  Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+	u32 ret;
+
+	spin_lock(&msgr->global_seq_lock);
+	if (msgr->global_seq < gt)
+		msgr->global_seq = gt;
+	ret = ++msgr->global_seq;
+	spin_unlock(&msgr->global_seq_lock);
+	return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+	struct ceph_msg *m = con->out_msg;
+
+	dout("prepare_write_message_footer %p\n", con);
+	con->out_kvec_is_msg = true;
+	con->out_kvec[v].iov_base = &m->footer;
+	con->out_kvec[v].iov_len = sizeof(m->footer);
+	con->out_kvec_bytes += sizeof(m->footer);
+	con->out_kvec_left++;
+	con->out_more = m->more_to_follow;
+	con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	int v = 0;
+
+	con->out_kvec_bytes = 0;
+	con->out_kvec_is_msg = true;
+	con->out_msg_done = false;
+
+	/* Sneak an ack in there first?  If we can get it into the same
+	 * TCP packet that's a good thing. */
+	if (con->in_seq > con->in_seq_acked) {
+		con->in_seq_acked = con->in_seq;
+		con->out_kvec[v].iov_base = &tag_ack;
+		con->out_kvec[v++].iov_len = 1;
+		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con->out_kvec[v].iov_base = &con->out_temp_ack;
+		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	}
+
+	m = list_first_entry(&con->out_queue,
+		       struct ceph_msg, list_head);
+	con->out_msg = m;
+	if (test_bit(LOSSYTX, &con->state)) {
+		list_del_init(&m->list_head);
+	} else {
+		/* put message on sent list */
+		ceph_msg_get(m);
+		list_move_tail(&m->list_head, &con->out_sent);
+	}
+
+	/*
+	 * only assign outgoing seq # if we haven't sent this message
+	 * yet.  if it is requeued, resend with it's original seq.
+	 */
+	if (m->needs_out_seq) {
+		m->hdr.seq = cpu_to_le64(++con->out_seq);
+		m->needs_out_seq = false;
+	}
+
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+	     m, con->out_seq, le16_to_cpu(m->hdr.type),
+	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+	     le32_to_cpu(m->hdr.data_len),
+	     m->nr_pages);
+	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+	/* tag + hdr + front + middle */
+	con->out_kvec[v].iov_base = &tag_msg;
+	con->out_kvec[v++].iov_len = 1;
+	con->out_kvec[v].iov_base = &m->hdr;
+	con->out_kvec[v++].iov_len = sizeof(m->hdr);
+	con->out_kvec[v++] = m->front;
+	if (m->middle)
+		con->out_kvec[v++] = m->middle->vec;
+	con->out_kvec_left = v;
+	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+		(m->middle ? m->middle->vec.iov_len : 0);
+	con->out_kvec_cur = con->out_kvec;
+
+	/* fill in crc (except data pages), footer */
+	con->out_msg->hdr.crc =
+		cpu_to_le32(crc32c(0, (void *)&m->hdr,
+				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
+	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+	con->out_msg->footer.front_crc =
+		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+	if (m->middle)
+		con->out_msg->footer.middle_crc =
+			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+					   m->middle->vec.iov_len));
+	else
+		con->out_msg->footer.middle_crc = 0;
+	con->out_msg->footer.data_crc = 0;
+	dout("prepare_write_message front_crc %u data_crc %u\n",
+	     le32_to_cpu(con->out_msg->footer.front_crc),
+	     le32_to_cpu(con->out_msg->footer.middle_crc));
+
+	/* is there a data payload? */
+	if (le32_to_cpu(m->hdr.data_len) > 0) {
+		/* initialize page iterator */
+		con->out_msg_pos.page = 0;
+		if (m->pages)
+			con->out_msg_pos.page_pos =
+				le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		else
+			con->out_msg_pos.page_pos = 0;
+		con->out_msg_pos.data_pos = 0;
+		con->out_msg_pos.did_page_crc = 0;
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
+		/* no, queue up footer too and be done */
+		prepare_write_message_footer(con, v);
+	}
+
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+	dout("prepare_write_ack %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con->out_kvec[0].iov_base = &tag_ack;
+	con->out_kvec[0].iov_len = 1;
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con->out_kvec[1].iov_base = &con->out_temp_ack;
+	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 1;  /* more will follow.. eventually.. */
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+	dout("prepare_write_keepalive %p\n", con);
+	con->out_kvec[0].iov_base = &tag_keepalive;
+	con->out_kvec[0].iov_len = 1;
+	con->out_kvec_left = 1;
+	con->out_kvec_bytes = 1;
+	con->out_kvec_cur = con->out_kvec;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+	void *auth_buf;
+	int auth_len = 0;
+	int auth_protocol = 0;
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->get_authorizer)
+		con->ops->get_authorizer(con, &auth_buf, &auth_len,
+					 &auth_protocol, &con->auth_reply_buf,
+					 &con->auth_reply_buf_len,
+					 con->auth_retry);
+	mutex_lock(&con->mutex);
+
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+	con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+	con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+	con->out_kvec_left++;
+	con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+				 struct ceph_connection *con)
+{
+	int len = strlen(CEPH_BANNER);
+
+	con->out_kvec[0].iov_base = CEPH_BANNER;
+	con->out_kvec[0].iov_len = len;
+	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+				  struct ceph_connection *con,
+				  int after_banner)
+{
+	unsigned global_seq = get_global_seq(con->msgr, 0);
+	int proto;
+
+	switch (con->peer_name.type) {
+	case CEPH_ENTITY_TYPE_MON:
+		proto = CEPH_MONC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_OSD:
+		proto = CEPH_OSDC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_MDS:
+		proto = CEPH_MDSC_PROTOCOL;
+		break;
+	default:
+		BUG();
+	}
+
+	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+	     con->connect_seq, global_seq, proto);
+
+	con->out_connect.features = cpu_to_le64(msgr->supported_features);
+	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq = cpu_to_le32(global_seq);
+	con->out_connect.protocol_version = cpu_to_le32(proto);
+	con->out_connect.flags = 0;
+
+	if (!after_banner) {
+		con->out_kvec_left = 0;
+		con->out_kvec_bytes = 0;
+	}
+	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left++;
+	con->out_kvec_bytes += sizeof(con->out_connect);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+
+	prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+	while (con->out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+				       con->out_kvec_left, con->out_kvec_bytes,
+				       con->out_more);
+		if (ret <= 0)
+			goto out;
+		con->out_kvec_bytes -= ret;
+		if (con->out_kvec_bytes == 0)
+			break;            /* done */
+		while (ret > 0) {
+			if (ret >= con->out_kvec_cur->iov_len) {
+				ret -= con->out_kvec_cur->iov_len;
+				con->out_kvec_cur++;
+				con->out_kvec_left--;
+			} else {
+				con->out_kvec_cur->iov_len -= ret;
+				con->out_kvec_cur->iov_base += ret;
+				ret = 0;
+				break;
+			}
+		}
+	}
+	con->out_kvec_left = 0;
+	con->out_kvec_is_msg = false;
+	ret = 1;
+out:
+	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	return ret;  /* done! */
+}
+
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+	size_t len;
+	int crc = con->msgr->nocrc;
+	int ret;
+	int total_max_write;
+	int in_trail = 0;
+	size_t trail_len = (msg->trail ? msg->trail->length : 0);
+
+	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+	     con->out_msg_pos.page_pos);
+
+#ifdef CONFIG_BLOCK
+	if (msg->bio && !msg->bio_iter)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+	while (data_len > con->out_msg_pos.data_pos) {
+		struct page *page = NULL;
+		void *kaddr = NULL;
+		int max_write = PAGE_SIZE;
+		int page_shift = 0;
+
+		total_max_write = data_len - trail_len -
+			con->out_msg_pos.data_pos;
+
+		/*
+		 * if we are calculating the data crc (the default), we need
+		 * to map the page.  if our pages[] has been revoked, use the
+		 * zero page.
+		 */
+
+		/* have we reached the trail part of the data? */
+		if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+			in_trail = 1;
+
+			total_max_write = data_len - con->out_msg_pos.data_pos;
+
+			page = list_first_entry(&msg->trail->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+			max_write = PAGE_SIZE;
+		} else if (msg->pages) {
+			page = msg->pages[con->out_msg_pos.page];
+			if (crc)
+				kaddr = kmap(page);
+		} else if (msg->pagelist) {
+			page = list_first_entry(&msg->pagelist->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+		} else if (msg->bio) {
+			struct bio_vec *bv;
+
+			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+			page = bv->bv_page;
+			page_shift = bv->bv_offset;
+			if (crc)
+				kaddr = kmap(page) + page_shift;
+			max_write = bv->bv_len;
+#endif
+		} else {
+			page = con->msgr->zero_page;
+			if (crc)
+				kaddr = page_address(con->msgr->zero_page);
+		}
+		len = min_t(int, max_write - con->out_msg_pos.page_pos,
+			    total_max_write);
+
+		if (crc && !con->out_msg_pos.did_page_crc) {
+			void *base = kaddr + con->out_msg_pos.page_pos;
+			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+			BUG_ON(kaddr == NULL);
+			con->out_msg->footer.data_crc =
+				cpu_to_le32(crc32c(tmpcrc, base, len));
+			con->out_msg_pos.did_page_crc = 1;
+		}
+		ret = kernel_sendpage(con->sock, page,
+				      con->out_msg_pos.page_pos + page_shift,
+				      len,
+				      MSG_DONTWAIT | MSG_NOSIGNAL |
+				      MSG_MORE);
+
+		if (crc &&
+		    (msg->pages || msg->pagelist || msg->bio || in_trail))
+			kunmap(page);
+
+		if (ret <= 0)
+			goto out;
+
+		con->out_msg_pos.data_pos += ret;
+		con->out_msg_pos.page_pos += ret;
+		if (ret == len) {
+			con->out_msg_pos.page_pos = 0;
+			con->out_msg_pos.page++;
+			con->out_msg_pos.did_page_crc = 0;
+			if (in_trail)
+				list_move_tail(&page->lru,
+					       &msg->trail->head);
+			else if (msg->pagelist)
+				list_move_tail(&page->lru,
+					       &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+			else if (msg->bio)
+				iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
+		}
+	}
+
+	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+	/* prepare and queue up footer, too */
+	if (!crc)
+		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_left = 0;
+	con->out_kvec_cur = con->out_kvec;
+	prepare_write_message_footer(con, 0);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	int ret;
+
+	while (con->out_skip > 0) {
+		struct kvec iov = {
+			.iov_base = page_address(con->msgr->zero_page),
+			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+		};
+
+		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+		if (ret <= 0)
+			goto out;
+		con->out_skip -= ret;
+	}
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	dout("prepare_read_banner %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	dout("prepare_read_connect %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	dout("prepare_read_tag %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg != NULL);
+	con->in_base_pos = 0;
+	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+	return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+			int *to, int size, void *object)
+{
+	*to += size;
+	while (con->in_base_pos < *to) {
+		int left = *to - con->in_base_pos;
+		int have = size - left;
+		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+	/* peer's banner */
+	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+			   &con->actual_peer_addr);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+			   &con->peer_addr_for_me);
+	if (ret <= 0)
+		goto out;
+out:
+	return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+			   con->auth_reply_buf);
+	if (ret <= 0)
+		goto out;
+
+	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+	     con, (int)con->in_reply.tag,
+	     le32_to_cpu(con->in_reply.connect_seq),
+	     le32_to_cpu(con->in_reply.global_seq));
+out:
+	return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+		pr_err("connect to %s got bad banner\n",
+		       ceph_pr_addr(&con->peer_addr.in_addr));
+		con->error_msg = "protocol error, bad banner";
+		return -1;
+	}
+	return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+	case AF_INET6:
+		return
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+	}
+	return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ntohs(((struct sockaddr_in *)ss)->sin_port);
+	case AF_INET6:
+		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+	}
+	return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)ss)->sin_port = htons(p);
+	case AF_INET6:
+		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+	}
+}
+
+/*
+ * Parse an ip[:port] list into an addr array.  Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+		   struct ceph_entity_addr *addr,
+		   int max_count, int *count)
+{
+	int i;
+	const char *p = c;
+
+	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+	for (i = 0; i < max_count; i++) {
+		const char *ipend;
+		struct sockaddr_storage *ss = &addr[i].in_addr;
+		struct sockaddr_in *in4 = (void *)ss;
+		struct sockaddr_in6 *in6 = (void *)ss;
+		int port;
+		char delim = ',';
+
+		if (*p == '[') {
+			delim = ']';
+			p++;
+		}
+
+		memset(ss, 0, sizeof(*ss));
+		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+			     delim, &ipend))
+			ss->ss_family = AF_INET;
+		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+				  delim, &ipend))
+			ss->ss_family = AF_INET6;
+		else
+			goto bad;
+		p = ipend;
+
+		if (delim == ']') {
+			if (*p != ']') {
+				dout("missing matching ']'\n");
+				goto bad;
+			}
+			p++;
+		}
+
+		/* port? */
+		if (p < end && *p == ':') {
+			port = 0;
+			p++;
+			while (p < end && *p >= '0' && *p <= '9') {
+				port = (port * 10) + (*p - '0');
+				p++;
+			}
+			if (port > 65535 || port == 0)
+				goto bad;
+		} else {
+			port = CEPH_MON_PORT;
+		}
+
+		addr_set_port(ss, port);
+
+		dout("parse_ips got %s\n", ceph_pr_addr(ss));
+
+		if (p == end)
+			break;
+		if (*p != ',')
+			goto bad;
+		p++;
+	}
+
+	if (p != end)
+		goto bad;
+
+	if (count)
+		*count = i + 1;
+	return 0;
+
+bad:
+	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_parse_ips);
+
+static int process_banner(struct ceph_connection *con)
+{
+	dout("process_banner on %p\n", con);
+
+	if (verify_hello(con) < 0)
+		return -1;
+
+	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_addr(&con->peer_addr_for_me);
+
+	/*
+	 * Make sure the other end is who we wanted.  note that the other
+	 * end may not yet know their ip address, so if it's 0.0.0.0, give
+	 * them the benefit of the doubt.
+	 */
+	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+		   sizeof(con->peer_addr)) != 0 &&
+	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+		pr_warning("wrong peer, want %s/%d, got %s/%d\n",
+			   ceph_pr_addr(&con->peer_addr.in_addr),
+			   (int)le32_to_cpu(con->peer_addr.nonce),
+			   ceph_pr_addr(&con->actual_peer_addr.in_addr),
+			   (int)le32_to_cpu(con->actual_peer_addr.nonce));
+		con->error_msg = "wrong peer at address";
+		return -1;
+	}
+
+	/*
+	 * did we learn our address?
+	 */
+	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+		int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+		memcpy(&con->msgr->inst.addr.in_addr,
+		       &con->peer_addr_for_me.in_addr,
+		       sizeof(con->peer_addr_for_me.in_addr));
+		addr_set_port(&con->msgr->inst.addr.in_addr, port);
+		encode_my_addr(con->msgr);
+		dout("process_banner learned my addr is %s\n",
+		     ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+	}
+
+	set_bit(NEGOTIATING, &con->state);
+	prepare_read_connect(con);
+	return 0;
+}
+
+static void fail_protocol(struct ceph_connection *con)
+{
+	reset_connection(con);
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->bad_proto)
+		con->ops->bad_proto(con);
+	mutex_lock(&con->mutex);
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	u64 sup_feat = con->msgr->supported_features;
+	u64 req_feat = con->msgr->required_features;
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr),
+		       le32_to_cpu(con->out_connect.protocol_version),
+		       le32_to_cpu(con->in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		fail_protocol(con);
+		return -1;
+
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->auth_retry);
+		if (con->auth_retry == 2) {
+			con->error_msg = "connect authorization failure";
+			reset_connection(con);
+			set_bit(CLOSED, &con->state);
+			return -1;
+		}
+		con->auth_retry = 1;
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->in_connect.connect_seq));
+		pr_err("%s%lld %s connection reset\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr.in_addr));
+		reset_connection(con);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+
+		/* Tell ceph about it. */
+		mutex_unlock(&con->mutex);
+		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+		     le32_to_cpu(con->out_connect.connect_seq),
+		     le32_to_cpu(con->in_connect.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_connect.global_seq));
+		get_global_seq(con->msgr,
+			       le32_to_cpu(con->in_connect.global_seq));
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       ceph_pr_addr(&con->peer_addr.in_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			fail_protocol(con);
+			return -1;
+		}
+		clear_bit(CONNECTING, &con->state);
+		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+		con->connect_seq++;
+		con->peer_features = server_feat;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     con->connect_seq);
+		WARN_ON(con->connect_seq !=
+			le32_to_cpu(con->in_reply.connect_seq));
+
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			set_bit(LOSSYTX, &con->state);
+
+		prepare_read_tag(con);
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		pr_err("process_connect peer connecting WAIT\n");
+
+	default:
+		pr_err("connect protocol error, will retry\n");
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	int to = 0;
+
+	return read_partial(con, &to, sizeof(con->in_temp_ack),
+			    &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	u64 ack = le64_to_cpu(con->in_temp_ack);
+	u64 seq;
+
+	while (!list_empty(&con->out_sent)) {
+		m = list_first_entry(&con->out_sent, struct ceph_msg,
+				     list_head);
+		seq = le64_to_cpu(m->hdr.seq);
+		if (seq > ack)
+			break;
+		dout("got ack for seq %llu type %d at %p\n", seq,
+		     le16_to_cpu(m->hdr.type), m);
+		ceph_msg_remove(m);
+	}
+	prepare_read_tag(con);
+}
+
+
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+					struct kvec *section,
+					unsigned int sec_len, u32 *crc)
+{
+	int ret, left;
+
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		BUG_ON(section->iov_base == NULL);
+		left = sec_len - section->iov_len;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+				       section->iov_len, left);
+		if (ret <= 0)
+			return ret;
+		section->iov_len += ret;
+		if (section->iov_len == sec_len)
+			*crc = crc32c(0, section->iov_base,
+				      section->iov_len);
+	}
+
+	return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+				      struct page **pages,
+				      unsigned data_len, int datacrc)
+{
+	void *p;
+	int ret;
+	int left;
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+	/* (page) data */
+	BUG_ON(pages == NULL);
+	p = kmap(pages[con->in_msg_pos.page]);
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(pages[con->in_msg_pos.page]);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+		con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.page++;
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+				    struct bio **bio_iter, int *bio_seg,
+				    unsigned data_len, int datacrc)
+{
+	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+	void *p;
+	int ret, left;
+
+	if (IS_ERR(bv))
+		return PTR_ERR(bv);
+
+	left = min((int)(data_len - con->in_msg_pos.data_pos),
+		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+	p = kmap(bv->bv_page) + bv->bv_offset;
+
+	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+			       left);
+	if (ret > 0 && datacrc)
+		con->in_data_crc =
+			crc32c(con->in_data_crc,
+				  p + con->in_msg_pos.page_pos, ret);
+	kunmap(bv->bv_page);
+	if (ret <= 0)
+		return ret;
+	con->in_msg_pos.data_pos += ret;
+	con->in_msg_pos.page_pos += ret;
+	if (con->in_msg_pos.page_pos == bv->bv_len) {
+		con->in_msg_pos.page_pos = 0;
+		iter_bio_next(bio_iter, bio_seg);
+	}
+
+	return ret;
+}
+#endif
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	int ret;
+	int to, left;
+	unsigned front_len, middle_len, data_len, data_off;
+	int datacrc = con->msgr->nocrc;
+	int skip;
+	u64 seq;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	while (con->in_base_pos < sizeof(con->in_hdr)) {
+		left = sizeof(con->in_hdr) - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock,
+				       (char *)&con->in_hdr + con->in_base_pos,
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+		if (con->in_base_pos == sizeof(con->in_hdr)) {
+			u32 crc = crc32c(0, (void *)&con->in_hdr,
+				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+			if (crc != le32_to_cpu(con->in_hdr.crc)) {
+				pr_err("read_partial_message bad hdr "
+				       " crc %u != expected %u\n",
+				       crc, con->in_hdr.crc);
+				return -EBADMSG;
+			}
+		}
+	}
+	front_len = le32_to_cpu(con->in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_off = le16_to_cpu(con->in_hdr.data_off);
+
+	/* verify seq# */
+	seq = le64_to_cpu(con->in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr.in_addr),
+			seq, con->in_seq + 1);
+		con->in_base_pos = -front_len - middle_len - data_len -
+			sizeof(m->footer);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+		return 0;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADMSG;
+	}
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		     con->in_hdr.front_len, con->in_hdr.data_len);
+		skip = 0;
+		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+		if (skip) {
+			/* skip this message */
+			dout("alloc_msg said skip message\n");
+			BUG_ON(con->in_msg);
+			con->in_base_pos = -front_len - middle_len - data_len -
+				sizeof(m->footer);
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
+			return 0;
+		}
+		if (!con->in_msg) {
+			con->error_msg =
+				"error allocating memory for incoming message";
+			return -ENOMEM;
+		}
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
+
+		con->in_msg_pos.page = 0;
+		if (m->pages)
+			con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		else
+			con->in_msg_pos.page_pos = 0;
+		con->in_msg_pos.data_pos = 0;
+	}
+
+	/* front */
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
+
+	/* middle */
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec,
+						   middle_len,
+						   &con->in_middle_crc);
+		if (ret <= 0)
+			return ret;
+	}
+#ifdef CONFIG_BLOCK
+	if (m->bio && !m->bio_iter)
+		init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
+
+	/* (page) data */
+	while (con->in_msg_pos.data_pos < data_len) {
+		if (m->pages) {
+			ret = read_partial_message_pages(con, m->pages,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#ifdef CONFIG_BLOCK
+		} else if (m->bio) {
+
+			ret = read_partial_message_bio(con,
+						 &m->bio_iter, &m->bio_seg,
+						 data_len, datacrc);
+			if (ret <= 0)
+				return ret;
+#endif
+		} else {
+			BUG_ON(1);
+		}
+	}
+
+	/* footer */
+	to = sizeof(m->hdr) + sizeof(m->footer);
+	while (con->in_base_pos < to) {
+		left = to - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+				       (con->in_base_pos - sizeof(m->hdr)),
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok? */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, m->footer.front_crc);
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n",
+		       m, con->in_middle_crc, m->footer.middle_crc);
+		return -EBADMSG;
+	}
+	if (datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+/*
+ * Process message.  This happens in the worker thread.  The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+	struct ceph_msg *msg;
+
+	msg = con->in_msg;
+	con->in_msg = NULL;
+
+	/* if first message, set peer_name */
+	if (con->peer_name.type == 0)
+		con->peer_name = msg->hdr.src;
+
+	con->in_seq++;
+	mutex_unlock(&con->mutex);
+
+	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+	     msg, le64_to_cpu(msg->hdr.seq),
+	     ENTITY_NAME(msg->hdr.src),
+	     le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.data_len),
+	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+	con->ops->dispatch(con, msg);
+
+	mutex_lock(&con->mutex);
+	prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+	struct ceph_messenger *msgr = con->msgr;
+	int ret = 1;
+
+	dout("try_write start %p state %lu nref %d\n", con, con->state,
+	     atomic_read(&con->nref));
+
+more:
+	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+	/* open the socket first? */
+	if (con->sock == NULL) {
+		/*
+		 * if we were STANDBY and are reconnecting _this_
+		 * connection, bump connect_seq now.  Always bump
+		 * global_seq.
+		 */
+		if (test_and_clear_bit(STANDBY, &con->state))
+			con->connect_seq++;
+
+		prepare_write_banner(msgr, con);
+		prepare_write_connect(msgr, con, 1);
+		prepare_read_banner(con);
+		set_bit(CONNECTING, &con->state);
+		clear_bit(NEGOTIATING, &con->state);
+
+		BUG_ON(con->in_msg);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %lu\n",
+		     con, con->state);
+		con->sock = ceph_tcp_connect(con);
+		if (IS_ERR(con->sock)) {
+			con->sock = NULL;
+			con->error_msg = "connect error";
+			ret = -1;
+			goto out;
+		}
+	}
+
+more_kvec:
+	/* kvec data queued? */
+	if (con->out_skip) {
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_skip err %d\n", ret);
+			goto done;
+		}
+	}
+	if (con->out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto done;
+	}
+
+	/* msg pages? */
+	if (con->out_msg) {
+		if (con->out_msg_done) {
+			ceph_msg_put(con->out_msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
+		ret = write_partial_msg_pages(con);
+		if (ret == 1)
+			goto more_kvec;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_msg_pages err %d\n",
+			     ret);
+			goto done;
+		}
+	}
+
+do_next:
+	if (!test_bit(CONNECTING, &con->state)) {
+		/* is anything else pending? */
+		if (!list_empty(&con->out_queue)) {
+			prepare_write_message(con);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {
+			prepare_write_ack(con);
+			goto more;
+		}
+		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	clear_bit(WRITE_PENDING, &con->state);
+	dout("try_write nothing else to write.\n");
+done:
+	ret = 0;
+out:
+	dout("try_write done on %p\n", con);
+	return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+	int ret = -1;
+
+	if (!con->sock)
+		return 0;
+
+	if (test_bit(STANDBY, &con->state))
+		return 0;
+
+	dout("try_read start on %p\n", con);
+
+more:
+	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+	     con->in_base_pos);
+	if (test_bit(CONNECTING, &con->state)) {
+		if (!test_bit(NEGOTIATING, &con->state)) {
+			dout("try_read connecting\n");
+			ret = read_partial_banner(con);
+			if (ret <= 0)
+				goto done;
+			if (process_banner(con) < 0) {
+				ret = -1;
+				goto out;
+			}
+		}
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto done;
+		if (process_connect(con) < 0) {
+			ret = -1;
+			goto out;
+		}
+		goto more;
+	}
+
+	if (con->in_base_pos < 0) {
+		/*
+		 * skipping + discarding content.
+		 *
+		 * FIXME: there must be a better way to do this!
+		 */
+		static char buf[1024];
+		int skip = min(1024, -con->in_base_pos);
+		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+		if (ret <= 0)
+			goto done;
+		con->in_base_pos += ret;
+		if (con->in_base_pos)
+			goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * what's next?
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		if (ret <= 0)
+			goto done;
+		dout("try_read got tag %d\n", (int)con->in_tag);
+		switch (con->in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			set_bit(CLOSED, &con->state);   /* fixme */
+			goto done;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc";
+				ret = -EIO;
+				goto out;
+			case -EIO:
+				con->error_msg = "io error";
+				goto out;
+			default:
+				goto done;
+			}
+		}
+		if (con->in_tag == CEPH_MSGR_TAG_READY)
+			goto more;
+		process_message(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto done;
+		process_ack(con);
+		goto more;
+	}
+
+done:
+	ret = 0;
+out:
+	dout("try_read done on %p\n", con);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection.  Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY.  It
+ * clears QUEUED and does it's thing.  When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work.  If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+	if (test_bit(DEAD, &con->state)) {
+		dout("queue_con %p ignoring: DEAD\n",
+		     con);
+		return;
+	}
+
+	if (!con->ops->get(con)) {
+		dout("queue_con %p ref count 0\n", con);
+		return;
+	}
+
+	set_bit(QUEUED, &con->state);
+	if (test_bit(BUSY, &con->state)) {
+		dout("queue_con %p - already BUSY\n", con);
+		con->ops->put(con);
+	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+		dout("queue_con %p - already queued\n", con);
+		con->ops->put(con);
+	} else {
+		dout("queue_con %p\n", con);
+	}
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+	struct ceph_connection *con = container_of(work, struct ceph_connection,
+						   work.work);
+	int backoff = 0;
+
+more:
+	if (test_and_set_bit(BUSY, &con->state) != 0) {
+		dout("con_work %p BUSY already set\n", con);
+		goto out;
+	}
+	dout("con_work %p start, clearing QUEUED\n", con);
+	clear_bit(QUEUED, &con->state);
+
+	mutex_lock(&con->mutex);
+
+	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+		dout("con_work CLOSED\n");
+		con_close_socket(con);
+		goto done;
+	}
+	if (test_and_clear_bit(OPENING, &con->state)) {
+		/* reopen w/ new peer */
+		dout("con_work OPENING\n");
+		con_close_socket(con);
+	}
+
+	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+	    try_read(con) < 0 ||
+	    try_write(con) < 0) {
+		mutex_unlock(&con->mutex);
+		backoff = 1;
+		ceph_fault(con);     /* error/fault path */
+		goto done_unlocked;
+	}
+
+done:
+	mutex_unlock(&con->mutex);
+
+done_unlocked:
+	clear_bit(BUSY, &con->state);
+	dout("con->state=%lu\n", con->state);
+	if (test_bit(QUEUED, &con->state)) {
+		if (!backoff || test_bit(OPENING, &con->state)) {
+			dout("con_work %p QUEUED reset, looping\n", con);
+			goto more;
+		}
+		dout("con_work %p QUEUED reset, but just faulted\n", con);
+		clear_bit(QUEUED, &con->state);
+	}
+	dout("con_work %p done\n", con);
+
+out:
+	con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler.  A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	       ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	dout("fault %p state %lu to peer %s\n",
+	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+	if (test_bit(LOSSYTX, &con->state)) {
+		dout("fault on LOSSYTX channel\n");
+		goto out;
+	}
+
+	mutex_lock(&con->mutex);
+	if (test_bit(CLOSED, &con->state))
+		goto out_unlock;
+
+	con_close_socket(con);
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
+	/* Requeue anything that hasn't been acked */
+	list_splice_init(&con->out_sent, &con->out_queue);
+
+	/* If there are no messages in the queue, place the connection
+	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
+	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+		dout("fault setting STANDBY\n");
+		set_bit(STANDBY, &con->state);
+	} else {
+		/* retry after a delay. */
+		if (con->delay == 0)
+			con->delay = BASE_DELAY_INTERVAL;
+		else if (con->delay < MAX_DELAY_INTERVAL)
+			con->delay *= 2;
+		dout("fault queueing %p delay %lu\n", con, con->delay);
+		con->ops->get(con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay)) == 0)
+			con->ops->put(con);
+	}
+
+out_unlock:
+	mutex_unlock(&con->mutex);
+out:
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+	 */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+					     u32 supported_features,
+					     u32 required_features)
+{
+	struct ceph_messenger *msgr;
+
+	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+	if (msgr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	msgr->supported_features = supported_features;
+	msgr->required_features = required_features;
+
+	spin_lock_init(&msgr->global_seq_lock);
+
+	/* the zero page is needed if a request is "canceled" while the message
+	 * is being written over the socket */
+	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
+	if (!msgr->zero_page) {
+		kfree(msgr);
+		return ERR_PTR(-ENOMEM);
+	}
+	kmap(msgr->zero_page);
+
+	if (myaddr)
+		msgr->inst.addr = *myaddr;
+
+	/* select a random nonce */
+	msgr->inst.addr.type = 0;
+	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+	encode_my_addr(msgr);
+
+	dout("messenger_create %p\n", msgr);
+	return msgr;
+}
+EXPORT_SYMBOL(ceph_messenger_create);
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+	dout("destroy %p\n", msgr);
+	kunmap(msgr->zero_page);
+	__free_page(msgr->zero_page);
+	kfree(msgr);
+	dout("destroyed messenger %p\n", msgr);
+}
+EXPORT_SYMBOL(ceph_messenger_destroy);
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	if (test_bit(CLOSED, &con->state)) {
+		dout("con_send %p closed, dropping %p\n", con, msg);
+		ceph_msg_put(msg);
+		return;
+	}
+
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst.name;
+
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+	msg->needs_out_seq = true;
+
+	/* queue */
+	mutex_lock(&con->mutex);
+	BUG_ON(!list_empty(&msg->list_head));
+	list_add_tail(&msg->list_head, &con->out_queue);
+	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
+	     le32_to_cpu(msg->hdr.data_len));
+	mutex_unlock(&con->mutex);
+
+	/* if there wasn't anything waiting to send before, queue
+	 * new work */
+	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (!list_empty(&msg->list_head)) {
+		dout("con_revoke %p msg %p - was on queue\n", con, msg);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+	}
+	if (con->out_msg == msg) {
+		dout("con_revoke %p msg %p - was sending\n", con, msg);
+		con->out_msg = NULL;
+		if (con->out_kvec_is_msg) {
+			con->out_skip = con->out_kvec_bytes;
+			con->out_kvec_is_msg = false;
+		}
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->mutex);
+	if (con->in_msg && con->in_msg == msg) {
+		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
+		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+		/* skip rest of message */
+		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+			con->in_base_pos = con->in_base_pos -
+				sizeof(struct ceph_msg_header) -
+				front_len -
+				middle_len -
+				data_len -
+				sizeof(struct ceph_msg_footer);
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+	} else {
+		dout("con_revoke_pages %p msg %p pages %p no-op\n",
+		     con, con->in_msg, msg);
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
+{
+	struct ceph_msg *m;
+
+	m = kmalloc(sizeof(*m), flags);
+	if (m == NULL)
+		goto out;
+	kref_init(&m->kref);
+	INIT_LIST_HEAD(&m->list_head);
+
+	m->hdr.tid = 0;
+	m->hdr.type = cpu_to_le16(type);
+	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->hdr.version = 0;
+	m->hdr.front_len = cpu_to_le32(front_len);
+	m->hdr.middle_len = 0;
+	m->hdr.data_len = 0;
+	m->hdr.data_off = 0;
+	m->hdr.reserved = 0;
+	m->footer.front_crc = 0;
+	m->footer.middle_crc = 0;
+	m->footer.data_crc = 0;
+	m->footer.flags = 0;
+	m->front_max = front_len;
+	m->front_is_vmalloc = false;
+	m->more_to_follow = false;
+	m->pool = NULL;
+
+	/* front */
+	if (front_len) {
+		if (front_len > PAGE_CACHE_SIZE) {
+			m->front.iov_base = __vmalloc(front_len, flags,
+						      PAGE_KERNEL);
+			m->front_is_vmalloc = true;
+		} else {
+			m->front.iov_base = kmalloc(front_len, flags);
+		}
+		if (m->front.iov_base == NULL) {
+			pr_err("msg_new can't allocate %d bytes\n",
+			     front_len);
+			goto out2;
+		}
+	} else {
+		m->front.iov_base = NULL;
+	}
+	m->front.iov_len = front_len;
+
+	/* middle */
+	m->middle = NULL;
+
+	/* data */
+	m->nr_pages = 0;
+	m->pages = NULL;
+	m->pagelist = NULL;
+	m->bio = NULL;
+	m->bio_iter = NULL;
+	m->bio_seg = 0;
+	m->trail = NULL;
+
+	dout("ceph_msg_new %p front %d\n", m, front_len);
+	return m;
+
+out2:
+	ceph_msg_put(m);
+out:
+	pr_err("msg_new can't create type %d front %d\n", type, front_len);
+	return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg.  This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int type = le16_to_cpu(msg->hdr.type);
+	int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+	     ceph_msg_type_name(type), middle_len);
+	BUG_ON(!middle_len);
+	BUG_ON(msg->middle);
+
+	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+	if (!msg->middle)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip)
+{
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg = NULL;
+	int ret;
+
+	if (con->ops->alloc_msg) {
+		mutex_unlock(&con->mutex);
+		msg = con->ops->alloc_msg(con, hdr, skip);
+		mutex_lock(&con->mutex);
+		if (!msg || *skip)
+			return NULL;
+	}
+	if (!msg) {
+		*skip = 0;
+		msg = ceph_msg_new(type, front_len, GFP_NOFS);
+		if (!msg) {
+			pr_err("unable to allocate msg type %d len %d\n",
+			       type, front_len);
+			return NULL;
+		}
+	}
+	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+	if (middle_len && !msg->middle) {
+		ret = ceph_alloc_middle(con, msg);
+		if (ret < 0) {
+			ceph_msg_put(msg);
+			return NULL;
+		}
+	}
+
+	return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+	dout("msg_kfree %p\n", m);
+	if (m->front_is_vmalloc)
+		vfree(m->front.iov_base);
+	else
+		kfree(m->front.iov_base);
+	kfree(m);
+}
+
+/*
+ * Drop a msg ref.  Destroy as needed.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+	dout("ceph_msg_put last one on %p\n", m);
+	WARN_ON(!list_empty(&m->list_head));
+
+	/* drop middle, data, if any */
+	if (m->middle) {
+		ceph_buffer_put(m->middle);
+		m->middle = NULL;
+	}
+	m->nr_pages = 0;
+	m->pages = NULL;
+
+	if (m->pagelist) {
+		ceph_pagelist_release(m->pagelist);
+		kfree(m->pagelist);
+		m->pagelist = NULL;
+	}
+
+	m->trail = NULL;
+
+	if (m->pool)
+		ceph_msgpool_put(m->pool, m);
+	else
+		ceph_msg_kfree(m);
+}
+EXPORT_SYMBOL(ceph_msg_last_put);
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+		 msg->front_max, msg->nr_pages);
+	print_hex_dump(KERN_DEBUG, "header: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->hdr, sizeof(msg->hdr), true);
+	print_hex_dump(KERN_DEBUG, " front: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       msg->front.iov_base, msg->front.iov_len, true);
+	if (msg->middle)
+		print_hex_dump(KERN_DEBUG, "middle: ",
+			       DUMP_PREFIX_OFFSET, 16, 1,
+			       msg->middle->vec.iov_base,
+			       msg->middle->vec.iov_len, true);
+	print_hex_dump(KERN_DEBUG, "footer: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 00000000000..8a079399174
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+	struct ceph_monmap *m = NULL;
+	int i, err = -EINVAL;
+	struct ceph_fsid fsid;
+	u32 epoch, num_mon;
+	u16 version;
+	u32 len;
+
+	ceph_decode_32_safe(&p, end, len, bad);
+	ceph_decode_need(&p, end, len, bad);
+
+	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+	ceph_decode_16_safe(&p, end, version, bad);
+
+	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(&p);
+
+	num_mon = ceph_decode_32(&p);
+	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+	if (num_mon >= CEPH_MAX_MON)
+		goto bad;
+	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+	m->fsid = fsid;
+	m->epoch = epoch;
+	m->num_mon = num_mon;
+	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+	for (i = 0; i < num_mon; i++)
+		ceph_decode_addr(&m->mon_inst[i].addr);
+
+	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+	     m->num_mon);
+	for (i = 0; i < m->num_mon; i++)
+		dout("monmap_decode  mon%d is %s\n", i,
+		     ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+	return m;
+
+bad:
+	dout("monmap_decode failed with %d\n", err);
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+	int i;
+
+	for (i = 0; i < m->num_mon; i++)
+		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+			return 1;
+	return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+	monc->pending_auth = 1;
+	monc->m_auth->front.iov_len = len;
+	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_con_revoke(monc->con, monc->m_auth);
+	ceph_msg_get(monc->m_auth);  /* keep our ref */
+	ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+	if (monc->con) {
+		dout("__close_session closing mon%d\n", monc->cur_mon);
+		ceph_con_revoke(monc->con, monc->m_auth);
+		ceph_con_close(monc->con);
+		monc->cur_mon = -1;
+		monc->pending_auth = 0;
+		ceph_auth_reset(monc->auth);
+	}
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+	char r;
+	int ret;
+
+	if (monc->cur_mon < 0) {
+		get_random_bytes(&r, 1);
+		monc->cur_mon = r % monc->monmap->num_mon;
+		dout("open_session num=%d r=%d -> mon%d\n",
+		     monc->monmap->num_mon, r, monc->cur_mon);
+		monc->sub_sent = 0;
+		monc->sub_renew_after = jiffies;  /* i.e., expired */
+		monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+		dout("open_session mon%d opening\n", monc->cur_mon);
+		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_open(monc->con,
+			      &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+		/* initiatiate authentication handshake */
+		ret = ceph_auth_build_hello(monc->auth,
+					    monc->m_auth->front.iov_base,
+					    monc->m_auth->front_max);
+		__send_prepared_auth_request(monc, ret);
+	} else {
+		dout("open_session mon%d already open\n", monc->cur_mon);
+	}
+	return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+	return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+	unsigned delay;
+
+	if (monc->cur_mon < 0 || __sub_expired(monc))
+		delay = 10 * HZ;
+	else
+		delay = 20 * HZ;
+	dout("__schedule_delayed after %u\n", delay);
+	schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+	     (unsigned)monc->sub_sent, __sub_expired(monc),
+	     monc->want_next_osdmap);
+	if ((__sub_expired(monc) && !monc->sub_sent) ||
+	    monc->want_next_osdmap == 1) {
+		struct ceph_msg *msg = monc->m_subscribe;
+		struct ceph_mon_subscribe_item *i;
+		void *p, *end;
+		int num;
+
+		p = msg->front.iov_base;
+		end = p + msg->front_max;
+
+		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+		ceph_encode_32(&p, num);
+
+		if (monc->want_next_osdmap) {
+			dout("__send_subscribe to 'osdmap' %u\n",
+			     (unsigned)monc->have_osdmap);
+			ceph_encode_string(&p, end, "osdmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_osdmap);
+			i->onetime = 1;
+			p += sizeof(*i);
+			monc->want_next_osdmap = 2;  /* requested */
+		}
+		if (monc->want_mdsmap) {
+			dout("__send_subscribe to 'mdsmap' %u+\n",
+			     (unsigned)monc->have_mdsmap);
+			ceph_encode_string(&p, end, "mdsmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_mdsmap);
+			i->onetime = 0;
+			p += sizeof(*i);
+		}
+		ceph_encode_string(&p, end, "monmap", 6);
+		i = p;
+		i->have = 0;
+		i->onetime = 0;
+		p += sizeof(*i);
+
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		ceph_con_revoke(monc->con, msg);
+		ceph_con_send(monc->con, ceph_msg_get(msg));
+
+		monc->sub_sent = jiffies | 1;  /* never 0 */
+	}
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	unsigned seconds;
+	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	seconds = le32_to_cpu(h->duration);
+
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		pr_info("mon%d %s session established\n",
+			monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+		monc->hunting = false;
+	}
+	dout("handle_subscribe_ack after %d seconds\n", seconds);
+	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+	monc->sub_sent = 0;
+	mutex_unlock(&monc->mutex);
+	return;
+bad:
+	pr_err("got corrupt subscribe-ack msg\n");
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_mdsmap = got;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_osdmap = got;
+	monc->want_next_osdmap = 0;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+	dout("request_next_osdmap have %u\n", monc->have_osdmap);
+	mutex_lock(&monc->mutex);
+	if (!monc->want_next_osdmap)
+		monc->want_next_osdmap = 1;
+	if (monc->want_next_osdmap < 2)
+		__send_subscribe(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ *
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+	if (!monc->con) {
+		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+		if (!monc->con)
+			return -ENOMEM;
+		ceph_con_init(monc->client->msgr, monc->con);
+		monc->con->private = monc;
+		monc->con->ops = &mon_con_ops;
+	}
+
+	mutex_lock(&monc->mutex);
+	__open_session(monc);
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+/*
+ * The monitor responds with mount ack indicate mount success.  The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+	void *p, *end;
+
+	mutex_lock(&monc->mutex);
+
+	dout("handle_monmap\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	monmap = ceph_monmap_decode(p, end);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		goto out;
+	}
+
+	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+		kfree(monmap);
+		goto out;
+	}
+
+	client->monc.monmap = monmap;
+	kfree(old);
+
+out:
+	mutex_unlock(&monc->mutex);
+	wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (e.g., statfs, poolop)
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+	struct ceph_mon_client *monc, u64 tid)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *n = monc->generic_request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mon_generic_request, node);
+		if (tid < req->tid)
+			n = n->rb_left;
+		else if (tid > req->tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
+{
+	struct rb_node **p = &monc->generic_request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mon_generic_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mon_generic_request, node);
+		if (new->tid < req->tid)
+			p = &(*p)->rb_left;
+		else if (new->tid > req->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+
+	kfree(req);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	struct ceph_mon_generic_request *req;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_msg *m;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (!req) {
+		dout("get_generic_reply %lld dne\n", tid);
+		*skip = 1;
+		m = NULL;
+	} else {
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		m = ceph_msg_get(req->reply);
+		/*
+		 * we don't need to track the connection reading into
+		 * this reply because we only have one open connection
+		 * at a time, ever.
+		 */
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
+}
+
+static int do_generic_request(struct ceph_mon_client *monc,
+			      struct ceph_mon_generic_request *req)
+{
+	int err;
+
+	/* register request */
+	mutex_lock(&monc->mutex);
+	req->tid = ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
+	ceph_con_send(monc->con, ceph_msg_get(req->request));
+	mutex_unlock(&monc->mutex);
+
+	err = wait_for_completion_interruptible(&req->completion);
+
+	mutex_lock(&monc->mutex);
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
+	mutex_unlock(&monc->mutex);
+
+	if (!err)
+		err = req->result;
+	return err;
+}
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len != sizeof(*reply))
+		goto bad;
+	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		*(struct ceph_statfs *)req->buf = reply->st;
+		req->result = 0;
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete_all(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs *h;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = sizeof(*buf);
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+
+	err = do_generic_request(monc, req);
+
+out:
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+/*
+ * pool ops
+ */
+static int get_poolop_reply_buf(const char *src, size_t src_len,
+				char *dst, size_t dst_len)
+{
+	u32 buf_len;
+
+	if (src_len != sizeof(u32) + dst_len)
+		return -EINVAL;
+
+	buf_len = le32_to_cpu(*(u32 *)src);
+	if (buf_len != dst_len)
+		return -EINVAL;
+
+	memcpy(dst, src + sizeof(u32), dst_len);
+	return 0;
+}
+
+static void handle_poolop_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len < sizeof(*reply))
+		goto bad;
+	dout("handle_poolop_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		if (req->buf_len &&
+		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
+				     msg->front.iov_len - sizeof(*reply),
+				     req->buf, req->buf_len) < 0) {
+			mutex_unlock(&monc->mutex);
+			goto bad;
+		}
+		req->result = le32_to_cpu(reply->reply_code);
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt generic reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous pool op.
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+			u32 pool, u64 snapid,
+			char *buf, int len)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_poolop *h;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = buf;
+	req->buf_len = len;
+	init_completion(&req->completion);
+
+	err = -ENOMEM;
+	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+	if (!req->request)
+		goto out;
+	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+	if (!req->reply)
+		goto out;
+
+	/* fill out request */
+	req->request->hdr.version = cpu_to_le16(2);
+	h = req->request->front.iov_base;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
+	h->fsid = monc->monmap->fsid;
+	h->pool = cpu_to_le32(pool);
+	h->op = cpu_to_le32(op);
+	h->auid = 0;
+	h->snapid = cpu_to_le64(snapid);
+	h->name_len = 0;
+
+	err = do_generic_request(monc, req);
+
+out:
+	kref_put(&req->kref, release_generic_request);
+	return err;
+}
+
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 *snapid)
+{
+	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+				   pool, 0, (char *)snapid, sizeof(*snapid));
+
+}
+EXPORT_SYMBOL(ceph_monc_create_snapid);
+
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+			    u32 pool, u64 snapid)
+{
+	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+				   pool, snapid, 0, 0);
+
+}
+
+/*
+ * Resend pending generic requests.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *p;
+
+	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_generic_request, node);
+		ceph_con_revoke(monc->con, req->request);
+		ceph_con_send(monc->con, ceph_msg_get(req->request));
+	}
+}
+
+/*
+ * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM).  And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+	struct ceph_mon_client *monc =
+		container_of(work, struct ceph_mon_client, delayed_work.work);
+
+	dout("monc delayed_work\n");
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		__close_session(monc);
+		__open_session(monc);  /* continue hunting */
+	} else {
+		ceph_con_keepalive(monc->con);
+
+		__validate_auth(monc);
+
+		if (monc->auth->ops->is_authenticated(monc->auth))
+			__send_subscribe(monc);
+	}
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+	struct ceph_options *opt = monc->client->options;
+	struct ceph_entity_addr *mon_addr = opt->mon_addr;
+	int num_mon = opt->num_mon;
+	int i;
+
+	/* build initial monmap */
+	monc->monmap = kzalloc(sizeof(*monc->monmap) +
+			       num_mon*sizeof(monc->monmap->mon_inst[0]),
+			       GFP_KERNEL);
+	if (!monc->monmap)
+		return -ENOMEM;
+	for (i = 0; i < num_mon; i++) {
+		monc->monmap->mon_inst[i].addr = mon_addr[i];
+		monc->monmap->mon_inst[i].addr.nonce = 0;
+		monc->monmap->mon_inst[i].name.type =
+			CEPH_ENTITY_TYPE_MON;
+		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+	}
+	monc->monmap->num_mon = num_mon;
+	monc->have_fsid = false;
+	return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+	int err = 0;
+
+	dout("init\n");
+	memset(monc, 0, sizeof(*monc));
+	monc->client = cl;
+	monc->monmap = NULL;
+	mutex_init(&monc->mutex);
+
+	err = build_initial_monmap(monc);
+	if (err)
+		goto out;
+
+	monc->con = NULL;
+
+	/* authentication */
+	monc->auth = ceph_auth_init(cl->options->name,
+				    cl->options->secret);
+	if (IS_ERR(monc->auth))
+		return PTR_ERR(monc->auth);
+	monc->auth->want_keys =
+		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+	/* msgs */
+	err = -ENOMEM;
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS);
+	if (!monc->m_subscribe_ack)
+		goto out_monmap;
+
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+	if (!monc->m_auth_reply)
+		goto out_subscribe;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
+	monc->pending_auth = 0;
+	if (!monc->m_auth)
+		goto out_auth_reply;
+
+	monc->cur_mon = -1;
+	monc->hunting = true;
+	monc->sub_renew_after = jiffies;
+	monc->sub_sent = 0;
+
+	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
+	monc->last_tid = 0;
+
+	monc->have_mdsmap = 0;
+	monc->have_osdmap = 0;
+	monc->want_next_osdmap = 1;
+	return 0;
+
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
+out_monmap:
+	kfree(monc->monmap);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&monc->delayed_work);
+
+	mutex_lock(&monc->mutex);
+	__close_session(monc);
+	if (monc->con) {
+		monc->con->private = NULL;
+		monc->con->ops->put(monc->con);
+		monc->con = NULL;
+	}
+	mutex_unlock(&monc->mutex);
+
+	ceph_auth_destroy(monc->auth);
+
+	ceph_msg_put(monc->m_auth);
+	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
+	ceph_msg_put(monc->m_subscribe_ack);
+
+	kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	int ret;
+	int was_auth = 0;
+
+	mutex_lock(&monc->mutex);
+	if (monc->auth->ops)
+		was_auth = monc->auth->ops->is_authenticated(monc->auth);
+	monc->pending_auth = 0;
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_max);
+	if (ret < 0) {
+		monc->client->auth_err = ret;
+		wake_up_all(&monc->client->auth_wq);
+	} else if (ret > 0) {
+		__send_prepared_auth_request(monc, ret);
+	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+		dout("authenticated, starting session\n");
+
+		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr->inst.name.num =
+					cpu_to_le64(monc->auth->global_id);
+
+		__send_subscribe(monc);
+		__resend_generic_request(monc);
+	}
+	mutex_unlock(&monc->mutex);
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	if (monc->pending_auth)
+		return 0;
+
+	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+			      monc->m_auth->front_max);
+	if (ret <= 0)
+		return ret; /* either an error, or no need to authenticate */
+	__send_prepared_auth_request(monc, ret);
+	return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = __validate_auth(monc);
+	mutex_unlock(&monc->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!monc)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_AUTH_REPLY:
+		handle_auth_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		handle_subscribe_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_STATFS_REPLY:
+		handle_statfs_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_POOLOP_REPLY:
+		handle_poolop_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_MAP:
+		ceph_monc_handle_map(monc, msg);
+		break;
+
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(&monc->client->osdc, msg);
+		break;
+
+	default:
+		/* can the chained handler handle it? */
+		if (monc->client->extra_mon_dispatch &&
+		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+			break;
+			
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+				      struct ceph_msg_header *hdr,
+				      int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m = NULL;
+
+	*skip = 0;
+
+	switch (type) {
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		m = ceph_msg_get(monc->m_subscribe_ack);
+		break;
+	case CEPH_MSG_POOLOP_REPLY:
+	case CEPH_MSG_STATFS_REPLY:
+		return get_generic_reply(con, hdr, skip);
+	case CEPH_MSG_AUTH_REPLY:
+		m = ceph_msg_get(monc->m_auth_reply);
+		break;
+	case CEPH_MSG_MON_MAP:
+	case CEPH_MSG_MDS_MAP:
+	case CEPH_MSG_OSD_MAP:
+		m = ceph_msg_new(type, front_len, GFP_NOFS);
+		break;
+	}
+
+	if (!m) {
+		pr_info("alloc_msg unknown type %d\n", type);
+		*skip = 1;
+	}
+	return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+	struct ceph_mon_client *monc = con->private;
+
+	if (!monc)
+		return;
+
+	dout("mon_fault\n");
+	mutex_lock(&monc->mutex);
+	if (!con->private)
+		goto out;
+
+	if (monc->con && !monc->hunting)
+		pr_info("mon%d %s session lost, "
+			"hunting for new mon\n", monc->cur_mon,
+			ceph_pr_addr(&monc->con->peer_addr.in_addr));
+
+	__close_session(monc);
+	if (!monc->hunting) {
+		/* start hunting */
+		monc->hunting = true;
+		__open_session(monc);
+	} else {
+		/* already hunting, let's wait a bit */
+		__schedule_delayed(monc);
+	}
+out:
+	mutex_unlock(&monc->mutex);
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+	.get = ceph_con_get,
+	.put = ceph_con_put,
+	.dispatch = dispatch,
+	.fault = mon_fault,
+	.alloc_msg = mon_alloc_msg,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
new file mode 100644
index 00000000000..d5f2d97ac05
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/msgpool.h>
+
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+	struct ceph_msgpool *pool = arg;
+	void *p;
+
+	p = ceph_msg_new(0, pool->front_len, gfp_mask);
+	if (!p)
+		pr_err("msgpool %s alloc failed\n", pool->name);
+	return p;
+}
+
+static void free_fn(void *element, void *arg)
+{
+	ceph_msg_put(element);
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int size, bool blocking, const char *name)
+{
+	pool->front_len = front_len;
+	pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+	if (!pool->pool)
+		return -ENOMEM;
+	pool->name = name;
+	return 0;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	mempool_destroy(pool->pool);
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+				  int front_len)
+{
+	if (front_len > pool->front_len) {
+		pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+		       pool->name, front_len, pool->front_len);
+		WARN_ON(1);
+
+		/* try to alloc a fresh message */
+		return ceph_msg_new(0, front_len, GFP_NOFS);
+	}
+
+	return mempool_alloc(pool->pool, GFP_NOFS);
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	/* reset msg front_len; user may have changed it */
+	msg->front.iov_len = pool->front_len;
+	msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+	kref_init(&msg->kref);  /* retake single ref */
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
new file mode 100644
index 00000000000..79391994b3e
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+#define OSD_OP_FRONT_LEN	4096
+#define OSD_OPREPLY_FRONT_LEN	512
+
+static const struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+static int op_needs_trail(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+	case CEPH_OSD_OP_CALL:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int op_has_extent(int op)
+{
+	return (op == CEPH_OSD_OP_READ ||
+		op == CEPH_OSD_OP_WRITE);
+}
+
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+			struct ceph_file_layout *layout,
+			u64 snapid,
+			u64 off, u64 *plen, u64 *bno,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+
+	reqhead->snapid = cpu_to_le64(snapid);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, plen, bno,
+				      &objoff, &objlen);
+	if (*plen < orig_len)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+
+	if (op_has_extent(op->op)) {
+		op->extent.offset = objoff;
+		op->extent.length = objlen;
+	}
+	req->r_num_pages = calc_pages_for(off, *plen);
+	if (op->op == CEPH_OSD_OP_WRITE)
+		op->payload_len = *plen;
+
+	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+	     *bno, objoff, objlen, req->r_num_pages);
+
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly.  shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+			struct ceph_vino vino,
+			struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			struct ceph_osd_request *req,
+			struct ceph_osd_req_op *op)
+{
+	u64 bno;
+
+	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+			     plen, &bno, req, op);
+
+	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+	req->r_oid_len = strlen(req->r_oid);
+}
+
+/*
+ * requests
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+	struct ceph_osd_request *req = container_of(kref,
+						    struct ceph_osd_request,
+						    r_kref);
+
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
+	if (req->r_con_filling_msg) {
+		dout("release_request revoking pages %p from con %p\n",
+		     req->r_pages, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg,
+				      req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+	}
+	if (req->r_own_pages)
+		ceph_release_page_vector(req->r_pages,
+					 req->r_num_pages);
+#ifdef CONFIG_BLOCK
+	if (req->r_bio)
+		bio_put(req->r_bio);
+#endif
+	ceph_put_snap_context(req->r_snapc);
+	if (req->r_trail) {
+		ceph_pagelist_release(req->r_trail);
+		kfree(req->r_trail);
+	}
+	if (req->r_mempool)
+		mempool_free(req, req->r_osdc->req_mempool);
+	else
+		kfree(req);
+}
+EXPORT_SYMBOL(ceph_osdc_release_request);
+
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+	int i = 0;
+
+	if (needs_trail)
+		*needs_trail = 0;
+	while (ops[i].op) {
+		if (needs_trail && op_needs_trail(ops[i].op))
+			*needs_trail = 1;
+		i++;
+	}
+
+	return i;
+}
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+					       int flags,
+					       struct ceph_snap_context *snapc,
+					       struct ceph_osd_req_op *ops,
+					       bool use_mempool,
+					       gfp_t gfp_flags,
+					       struct page **pages,
+					       struct bio *bio)
+{
+	struct ceph_osd_request *req;
+	struct ceph_msg *msg;
+	int needs_trail;
+	int num_op = get_num_ops(ops, &needs_trail);
+	size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+	msg_size += num_op*sizeof(struct ceph_osd_op);
+
+	if (use_mempool) {
+		req = mempool_alloc(osdc->req_mempool, gfp_flags);
+		memset(req, 0, sizeof(*req));
+	} else {
+		req = kzalloc(sizeof(*req), gfp_flags);
+	}
+	if (req == NULL)
+		return NULL;
+
+	req->r_osdc = osdc;
+	req->r_mempool = use_mempool;
+
+	kref_init(&req->r_kref);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+	req->r_flags = flags;
+
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+	/* create reply message */
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+				   OSD_OPREPLY_FRONT_LEN, gfp_flags);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+	req->r_reply = msg;
+
+	/* allocate space for the trailing data */
+	if (needs_trail) {
+		req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+		if (!req->r_trail) {
+			ceph_osdc_put_request(req);
+			return NULL;
+		}
+		ceph_pagelist_init(req->r_trail);
+	}
+	/* create request message; allow space for oid */
+	msg_size += 40;
+	if (snapc)
+		msg_size += sizeof(u64) * snapc->num_snaps;
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
+	if (!msg) {
+		ceph_osdc_put_request(req);
+		return NULL;
+	}
+
+	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+	memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+	req->r_request = msg;
+	req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+	if (bio) {
+		req->r_bio = bio;
+		bio_get(req->r_bio);
+	}
+#endif
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static void osd_req_encode_op(struct ceph_osd_request *req,
+			      struct ceph_osd_op *dst,
+			      struct ceph_osd_req_op *src)
+{
+	dst->op = cpu_to_le16(src->op);
+
+	switch (dst->op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		dst->extent.offset =
+			cpu_to_le64(src->extent.offset);
+		dst->extent.length =
+			cpu_to_le64(src->extent.length);
+		dst->extent.truncate_size =
+			cpu_to_le64(src->extent.truncate_size);
+		dst->extent.truncate_seq =
+			cpu_to_le32(src->extent.truncate_seq);
+		break;
+
+	case CEPH_OSD_OP_GETXATTR:
+	case CEPH_OSD_OP_SETXATTR:
+	case CEPH_OSD_OP_CMPXATTR:
+		BUG_ON(!req->r_trail);
+
+		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+		dst->xattr.cmp_op = src->xattr.cmp_op;
+		dst->xattr.cmp_mode = src->xattr.cmp_mode;
+		ceph_pagelist_append(req->r_trail, src->xattr.name,
+				     src->xattr.name_len);
+		ceph_pagelist_append(req->r_trail, src->xattr.val,
+				     src->xattr.value_len);
+		break;
+	case CEPH_OSD_OP_CALL:
+		BUG_ON(!req->r_trail);
+
+		dst->cls.class_len = src->cls.class_len;
+		dst->cls.method_len = src->cls.method_len;
+		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+		ceph_pagelist_append(req->r_trail, src->cls.class_name,
+				     src->cls.class_len);
+		ceph_pagelist_append(req->r_trail, src->cls.method_name,
+				     src->cls.method_len);
+		ceph_pagelist_append(req->r_trail, src->cls.indata,
+				     src->cls.indata_len);
+		break;
+	case CEPH_OSD_OP_ROLLBACK:
+		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+		break;
+	case CEPH_OSD_OP_STARTSYNC:
+		break;
+	default:
+		pr_err("unrecognized osd opcode %d\n", dst->op);
+		WARN_ON(1);
+		break;
+	}
+	dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+			     u64 off, u64 *plen,
+			     struct ceph_osd_req_op *src_ops,
+			     struct ceph_snap_context *snapc,
+			     struct timespec *mtime,
+			     const char *oid,
+			     int oid_len)
+{
+	struct ceph_msg *msg = req->r_request;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_req_op *src_op;
+	struct ceph_osd_op *op;
+	void *p;
+	int num_op = get_num_ops(src_ops, NULL);
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int flags = req->r_flags;
+	u64 data_len = 0;
+	int i;
+
+	head = msg->front.iov_base;
+	op = (void *)(head + 1);
+	p = (void *)(op + num_op);
+
+	req->r_snapc = ceph_get_snap_context(snapc);
+
+	head->client_inc = cpu_to_le32(1); /* always, for now. */
+	head->flags = cpu_to_le32(flags);
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(&head->mtime, mtime);
+	head->num_ops = cpu_to_le16(num_op);
+
+
+	/* fill in oid */
+	head->object_len = cpu_to_le32(oid_len);
+	memcpy(p, oid, oid_len);
+	p += oid_len;
+
+	src_op = src_ops;
+	while (src_op->op) {
+		osd_req_encode_op(req, op, src_op);
+		src_op++;
+		op++;
+	}
+
+	if (req->r_trail)
+		data_len += req->r_trail->length;
+
+	if (snapc) {
+		head->snap_seq = cpu_to_le64(snapc->seq);
+		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+		for (i = 0; i < snapc->num_snaps; i++) {
+			put_unaligned_le64(snapc->snaps[i], p);
+			p += sizeof(u64);
+		}
+	}
+
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+	} else if (data_len) {
+		req->r_request->hdr.data_off = 0;
+		req->r_request->hdr.data_len = cpu_to_le32(data_len);
+	}
+
+	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	msg_size = p - msg->front.iov_base;
+	msg->front.iov_len = msg_size;
+	msg->hdr.front_len = cpu_to_le32(msg_size);
+	return;
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply)
+{
+	struct ceph_osd_req_op ops[3];
+	struct ceph_osd_request *req;
+
+	ops[0].op = opcode;
+	ops[0].extent.truncate_seq = truncate_seq;
+	ops[0].extent.truncate_size = truncate_size;
+	ops[0].payload_len = 0;
+
+	if (do_sync) {
+		ops[1].op = CEPH_OSD_OP_STARTSYNC;
+		ops[1].payload_len = 0;
+		ops[2].op = 0;
+	} else
+		ops[1].op = 0;
+
+	req = ceph_osdc_alloc_request(osdc, flags,
+					 snapc, ops,
+					 use_mempool,
+					 GFP_NOFS, NULL, NULL);
+	if (IS_ERR(req))
+		return req;
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req, ops);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	ceph_osdc_build_request(req, off, plen, ops,
+				snapc,
+				mtime,
+				req->r_oid, req->r_oid_len);
+
+	return req;
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *new)
+{
+	struct rb_node **p = &osdc->requests.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_osd_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+						 u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+		    u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid) {
+			if (!n->rb_left)
+				return req;
+			n = n->rb_left;
+		} else if (tid > req->r_tid) {
+			n = n->rb_right;
+		} else {
+			return req;
+		}
+	}
+	return NULL;
+}
+
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+
+	if (!osd)
+		return;
+	dout("osd_reset osd%d\n", osd->o_osd);
+	osdc = osd->o_osdc;
+	down_read(&osdc->map_sem);
+	kick_requests(osdc, osd);
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd *osd;
+
+	osd = kzalloc(sizeof(*osd), GFP_NOFS);
+	if (!osd)
+		return NULL;
+
+	atomic_set(&osd->o_ref, 1);
+	osd->o_osdc = osdc;
+	INIT_LIST_HEAD(&osd->o_requests);
+	INIT_LIST_HEAD(&osd->o_osd_lru);
+	osd->o_incarnation = 1;
+
+	ceph_con_init(osdc->client->msgr, &osd->o_con);
+	osd->o_con.private = osd;
+	osd->o_con.ops = &osd_con_ops;
+	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+	INIT_LIST_HEAD(&osd->o_keepalive_item);
+	return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+	if (atomic_inc_not_zero(&osd->o_ref)) {
+		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+		     atomic_read(&osd->o_ref));
+		return osd;
+	} else {
+		dout("get_osd %p FAIL\n", osd);
+		return NULL;
+	}
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+	     atomic_read(&osd->o_ref) - 1);
+	if (atomic_dec_and_test(&osd->o_ref)) {
+		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+
+		if (osd->o_authorizer)
+			ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+		kfree(osd);
+	}
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("__remove_osd %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_requests));
+	rb_erase(&osd->o_node, &osdc->osds);
+	list_del_init(&osd->o_osd_lru);
+	ceph_con_close(&osd->o_con);
+	put_osd(osd);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+			      struct ceph_osd *osd)
+{
+	dout("__move_osd_to_lru %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_osd_lru));
+	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+	dout("__remove_osd_from_lru %p\n", osd);
+	if (!list_empty(&osd->o_osd_lru))
+		list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+	struct ceph_osd *osd, *nosd;
+
+	dout("__remove_old_osds %p\n", osdc);
+	mutex_lock(&osdc->request_mutex);
+	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+		if (!remove_all && time_before(jiffies, osd->lru_ttl))
+			break;
+		__remove_osd(osdc, osd);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	struct ceph_osd_request *req;
+	int ret = 0;
+
+	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+	if (list_empty(&osd->o_requests)) {
+		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benfit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
+	} else {
+		ceph_con_close(&osd->o_con);
+		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		osd->o_incarnation++;
+	}
+	return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+	struct rb_node **p = &osdc->osds.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd *osd = NULL;
+
+	while (*p) {
+		parent = *p;
+		osd = rb_entry(parent, struct ceph_osd, o_node);
+		if (new->o_osd < osd->o_osd)
+			p = &(*p)->rb_left;
+		else if (new->o_osd > osd->o_osd)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->o_node, parent, p);
+	rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+	struct ceph_osd *osd;
+	struct rb_node *n = osdc->osds.rb_node;
+
+	while (n) {
+		osd = rb_entry(n, struct ceph_osd, o_node);
+		if (o < osd->o_osd)
+			n = n->rb_left;
+		else if (o > osd->o_osd)
+			n = n->rb_right;
+		else
+			return osd;
+	}
+	return NULL;
+}
+
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+	schedule_delayed_work(&osdc->timeout_work,
+			osdc->client->options->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid.  If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
+{
+	mutex_lock(&osdc->request_mutex);
+	req->r_tid = ++osdc->last_tid;
+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+	INIT_LIST_HEAD(&req->r_req_lru_item);
+
+	dout("register_request %p tid %lld\n", req, req->r_tid);
+	__insert_request(osdc, req);
+	ceph_osdc_get_request(req);
+	osdc->num_requests++;
+
+	if (osdc->num_requests == 1) {
+		dout(" first request, scheduling timeout\n");
+		__schedule_osd_timeout(osdc);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+				 struct ceph_osd_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &osdc->requests);
+	osdc->num_requests--;
+
+	if (req->r_osd) {
+		/* make sure the original request isn't in flight. */
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+		list_del_init(&req->r_osd_item);
+		if (list_empty(&req->r_osd->o_requests))
+			__move_osd_to_lru(osdc, req->r_osd);
+		req->r_osd = NULL;
+	}
+
+	ceph_osdc_put_request(req);
+
+	list_del_init(&req->r_req_lru_item);
+	if (osdc->num_requests == 0) {
+		dout(" no requests, canceling timeout\n");
+		__cancel_osd_timeout(osdc);
+	}
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+	if (req->r_sent && req->r_osd) {
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		req->r_sent = 0;
+	}
+	list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately.  If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+		      struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_pg pgid;
+	int acting[CEPH_PG_MAX_SIZE];
+	int o = -1, num = 0;
+	int err;
+
+	dout("map_osds %p tid %lld\n", req, req->r_tid);
+	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+				      &req->r_file_layout, osdc->osdmap);
+	if (err)
+		return err;
+	pgid = reqhead->layout.ol_pgid;
+	req->r_pgid = pgid;
+
+	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+	if (err > 0) {
+		o = acting[0];
+		num = err;
+	}
+
+	if ((req->r_osd && req->r_osd->o_osd == o &&
+	     req->r_sent >= req->r_osd->o_incarnation &&
+	     req->r_num_pg_osds == num &&
+	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+	    (req->r_osd == NULL && o == -1))
+		return 0;  /* no change */
+
+	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+	     req->r_osd ? req->r_osd->o_osd : -1);
+
+	/* record full pg acting set */
+	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+	req->r_num_pg_osds = num;
+
+	if (req->r_osd) {
+		__cancel_request(req);
+		list_del_init(&req->r_osd_item);
+		req->r_osd = NULL;
+	}
+
+	req->r_osd = __lookup_osd(osdc, o);
+	if (!req->r_osd && o >= 0) {
+		err = -ENOMEM;
+		req->r_osd = create_osd(osdc);
+		if (!req->r_osd)
+			goto out;
+
+		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+		req->r_osd->o_osd = o;
+		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+		__insert_osd(osdc, req->r_osd);
+
+		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+	}
+
+	if (req->r_osd) {
+		__remove_osd_from_lru(req->r_osd);
+		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+	}
+	err = 1;   /* osd or pg changed */
+
+out:
+	return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+			  struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead;
+	int err;
+
+	err = __map_osds(osdc, req);
+	if (err < 0)
+		return err;
+	if (req->r_osd == NULL) {
+		dout("send_request %p no up osds in pg\n", req);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+		return 0;
+	}
+
+	dout("send_request %p tid %llu to osd%d flags %d\n",
+	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+	reqhead = req->r_request->front.iov_base;
+	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
+	reqhead->reassert_version = req->r_reassert_version;
+
+	req->r_stamp = jiffies;
+	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+	ceph_msg_get(req->r_request); /* send consumes a ref */
+	ceph_con_send(&req->r_osd->o_con, req->r_request);
+	req->r_sent = req->r_osd->o_incarnation;
+	return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds.  When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected.  Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_osd_request *req, *last_req = NULL;
+	struct ceph_osd *osd;
+	unsigned long timeout = osdc->client->options->osd_timeout * HZ;
+	unsigned long keepalive =
+		osdc->client->options->osd_keepalive_timeout * HZ;
+	unsigned long last_stamp = 0;
+	struct rb_node *p;
+	struct list_head slow_osds;
+
+	dout("timeout\n");
+	down_read(&osdc->map_sem);
+
+	ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			int err;
+
+			dout("osdc resending prev failed %lld\n", req->r_tid);
+			err = __send_request(osdc, req);
+			if (err)
+				dout("osdc failed again on %lld\n", req->r_tid);
+			else
+				req->r_resend = false;
+			continue;
+		}
+	}
+
+	/*
+	 * reset osds that appear to be _really_ unresponsive.  this
+	 * is a failsafe measure.. we really shouldn't be getting to
+	 * this point if the system is working properly.  the monitors
+	 * should mark the osd as failed and we should find out about
+	 * it from an updated osd map.
+	 */
+	while (timeout && !list_empty(&osdc->req_lru)) {
+		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+				 r_req_lru_item);
+
+		if (time_before(jiffies, req->r_stamp + timeout))
+			break;
+
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
+		last_req = req;
+		last_stamp = req->r_stamp;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+			   req->r_tid, osd->o_osd);
+		__kick_requests(osdc, osd);
+	}
+
+	/*
+	 * ping osds that are a bit slow.  this ensures that if there
+	 * is a break in the TCP connection we will notice, and reopen
+	 * a connection with that osd (from the fault callback).
+	 */
+	INIT_LIST_HEAD(&slow_osds);
+	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+		if (time_before(jiffies, req->r_stamp + keepalive))
+			break;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		dout(" tid %llu is slow, will send keepalive on osd%d\n",
+		     req->r_tid, osd->o_osd);
+		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+	}
+	while (!list_empty(&slow_osds)) {
+		osd = list_entry(slow_osds.next, struct ceph_osd,
+				 o_keepalive_item);
+		list_del_init(&osd->o_keepalive_item);
+		ceph_con_keepalive(&osd->o_con);
+	}
+
+	__schedule_osd_timeout(osdc);
+	mutex_unlock(&osdc->request_mutex);
+
+	up_read(&osdc->map_sem);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client,
+			     osds_timeout_work.work);
+	unsigned long delay =
+		osdc->client->options->osd_idle_ttl * HZ >> 2;
+
+	dout("osds timeout\n");
+	down_read(&osdc->map_sem);
+	remove_old_osds(osdc, 0);
+	up_read(&osdc->map_sem);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+			      round_jiffies_relative(delay));
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+			 struct ceph_connection *con)
+{
+	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int numops, object_len, flags;
+	s32 result;
+
+	tid = le64_to_cpu(msg->hdr.tid);
+	if (msg->front.iov_len < sizeof(*rhead))
+		goto bad;
+	numops = le32_to_cpu(rhead->num_ops);
+	object_len = le32_to_cpu(rhead->object_len);
+	result = le32_to_cpu(rhead->result);
+	if (msg->front.iov_len != sizeof(*rhead) + object_len +
+	    numops * sizeof(struct ceph_osd_op))
+		goto bad;
+	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+	/* lookup */
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (req == NULL) {
+		dout("handle_reply tid %llu dne\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		return;
+	}
+	ceph_osdc_get_request(req);
+	flags = le32_to_cpu(rhead->flags);
+
+	/*
+	 * if this connection filled our message, drop our reference now, to
+	 * avoid a (safe but slower) revoke later.
+	 */
+	if (req->r_con_filling_msg == con && req->r_reply == msg) {
+		dout(" dropping con_filling_msg ref %p\n", con);
+		req->r_con_filling_msg = NULL;
+		ceph_con_put(con);
+	}
+
+	if (!req->r_got_reply) {
+		unsigned bytes;
+
+		req->r_result = le32_to_cpu(rhead->result);
+		bytes = le32_to_cpu(msg->hdr.data_len);
+		dout("handle_reply result %d bytes %d\n", req->r_result,
+		     bytes);
+		if (req->r_result == 0)
+			req->r_result = bytes;
+
+		/* in case this is a write and we need to replay, */
+		req->r_reassert_version = rhead->reassert_version;
+
+		req->r_got_reply = 1;
+	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+		dout("handle_reply tid %llu dup ack\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		goto done;
+	}
+
+	dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+	/* either this is a read, or we got the safe response */
+	if (result < 0 ||
+	    (flags & CEPH_OSD_FLAG_ONDISK) ||
+	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+		__unregister_request(osdc, req);
+
+	mutex_unlock(&osdc->request_mutex);
+
+	if (req->r_callback)
+		req->r_callback(req, msg);
+	else
+		complete_all(&req->r_completion);
+
+	if (flags & CEPH_OSD_FLAG_ONDISK) {
+		if (req->r_safe_callback)
+			req->r_safe_callback(req, msg);
+		complete_all(&req->r_safe_completion);  /* fsync waiter */
+	}
+
+done:
+	ceph_osdc_put_request(req);
+	return;
+
+bad:
+	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+	       (int)sizeof(*rhead));
+	ceph_msg_dump(msg);
+}
+
+
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *p, *n;
+	int needmap = 0;
+	int err;
+
+	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+	if (kickosd) {
+		err = __reset_osd(osdc, kickosd);
+		if (err == -EAGAIN)
+			return 1;
+	} else {
+		for (p = rb_first(&osdc->osds); p; p = n) {
+			struct ceph_osd *osd =
+				rb_entry(p, struct ceph_osd, o_node);
+
+			n = rb_next(p);
+			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+			    memcmp(&osd->o_con.peer_addr,
+				   ceph_osd_addr(osdc->osdmap,
+						 osd->o_osd),
+				   sizeof(struct ceph_entity_addr)) != 0)
+				__reset_osd(osdc, osd);
+		}
+	}
+
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			dout(" r_resend set on tid %llu\n", req->r_tid);
+			__cancel_request(req);
+			goto kick;
+		}
+		if (req->r_osd && kickosd == req->r_osd) {
+			__cancel_request(req);
+			goto kick;
+		}
+
+		err = __map_osds(osdc, req);
+		if (err == 0)
+			continue;  /* no change */
+		if (err < 0) {
+			/*
+			 * FIXME: really, we should set the request
+			 * error and fail if this isn't a 'nofail'
+			 * request, but that's a fair bit more
+			 * complicated to do.  So retry!
+			 */
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+			continue;
+		}
+		if (req->r_osd == NULL) {
+			dout("tid %llu maps to no valid osd\n", req->r_tid);
+			needmap++;  /* request a newer map */
+			continue;
+		}
+
+kick:
+		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+		     req->r_osd ? req->r_osd->o_osd : -1);
+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
+		err = __send_request(osdc, req);
+		if (err) {
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+		}
+	}
+
+	return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed.  Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	int needmap;
+
+	mutex_lock(&osdc->request_mutex);
+	needmap = __kick_requests(osdc, kickosd);
+	mutex_unlock(&osdc->request_mutex);
+
+	if (needmap) {
+		dout("%d requests for down osds, need new map\n", needmap);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	}
+
+}
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster.  Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	void *p, *end, *next;
+	u32 nr_maps, maplen;
+	u32 epoch;
+	struct ceph_osdmap *newmap = NULL, *oldmap;
+	int err;
+	struct ceph_fsid fsid;
+
+	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	/* verify fsid */
+	ceph_decode_need(&p, end, sizeof(fsid), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_check_fsid(osdc->client, &fsid) < 0)
+		return;
+
+	down_write(&osdc->map_sem);
+
+	/* incremental maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d inc maps\n", nr_maps);
+	while (nr_maps > 0) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		next = p + maplen;
+		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+			dout("applying incremental map %u len %d\n",
+			     epoch, maplen);
+			newmap = osdmap_apply_incremental(&p, next,
+							  osdc->osdmap,
+							  osdc->client->msgr);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			if (newmap != osdc->osdmap) {
+				ceph_osdmap_destroy(osdc->osdmap);
+				osdc->osdmap = newmap;
+			}
+		} else {
+			dout("ignoring incremental map %u len %d\n",
+			     epoch, maplen);
+		}
+		p = next;
+		nr_maps--;
+	}
+	if (newmap)
+		goto done;
+
+	/* full maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d full maps\n", nr_maps);
+	while (nr_maps) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
+		ceph_decode_need(&p, end, maplen, bad);
+		if (nr_maps > 1) {
+			dout("skipping non-latest full map %u len %d\n",
+			     epoch, maplen);
+		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+			dout("skipping full map %u len %d, "
+			     "older than our %u\n", epoch, maplen,
+			     osdc->osdmap->epoch);
+		} else {
+			dout("taking full map %u len %d\n", epoch, maplen);
+			newmap = osdmap_decode(&p, p+maplen);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			BUG_ON(!newmap);
+			oldmap = osdc->osdmap;
+			osdc->osdmap = newmap;
+			if (oldmap)
+				ceph_osdmap_destroy(oldmap);
+		}
+		p += maplen;
+		nr_maps--;
+	}
+
+done:
+	downgrade_write(&osdc->map_sem);
+	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+	if (newmap)
+		kick_requests(osdc, NULL);
+	up_read(&osdc->map_sem);
+	wake_up_all(&osdc->client->auth_wq);
+	return;
+
+bad:
+	pr_err("osdc handle_map corrupt msg\n");
+	ceph_msg_dump(msg);
+	up_write(&osdc->map_sem);
+	return;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			    struct ceph_osd_request *req,
+			    bool nofail)
+{
+	int rc = 0;
+
+	req->r_request->pages = req->r_pages;
+	req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+	req->r_request->bio = req->r_bio;
+#endif
+	req->r_request->trail = req->r_trail;
+
+	register_request(osdc, req);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	/*
+	 * a racing kick_requests() may have sent the message for us
+	 * while we dropped request_mutex above, so only send now if
+	 * the request still han't been touched yet.
+	 */
+	if (req->r_sent == 0) {
+		rc = __send_request(osdc, req);
+		if (rc) {
+			if (nofail) {
+				dout("osdc_start_request failed send, "
+				     " marking %lld\n", req->r_tid);
+				req->r_resend = true;
+				rc = 0;
+			} else {
+				__unregister_request(osdc, req);
+			}
+		}
+	}
+	mutex_unlock(&osdc->request_mutex);
+	up_read(&osdc->map_sem);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+			   struct ceph_osd_request *req)
+{
+	int rc;
+
+	rc = wait_for_completion_interruptible(&req->r_completion);
+	if (rc < 0) {
+		mutex_lock(&osdc->request_mutex);
+		__cancel_request(req);
+		__unregister_request(osdc, req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+		return rc;
+	}
+
+	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+	return req->r_result;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req;
+	u64 last_tid, next_tid = 0;
+
+	mutex_lock(&osdc->request_mutex);
+	last_tid = osdc->last_tid;
+	while (1) {
+		req = __lookup_request_ge(osdc, next_tid);
+		if (!req)
+			break;
+		if (req->r_tid > last_tid)
+			break;
+
+		next_tid = req->r_tid + 1;
+		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+			continue;
+
+		ceph_osdc_get_request(req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("sync waiting on tid %llu (last is %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		mutex_lock(&osdc->request_mutex);
+		ceph_osdc_put_request(req);
+	}
+	mutex_unlock(&osdc->request_mutex);
+	dout("sync done (thru tid %llu)\n", last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+	int err;
+
+	dout("init\n");
+	osdc->client = client;
+	osdc->osdmap = NULL;
+	init_rwsem(&osdc->map_sem);
+	init_completion(&osdc->map_waiters);
+	osdc->last_requested_map = 0;
+	mutex_init(&osdc->request_mutex);
+	osdc->last_tid = 0;
+	osdc->osds = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->osd_lru);
+	osdc->requests = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->req_lru);
+	osdc->num_requests = 0;
+	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+
+	err = -ENOMEM;
+	osdc->req_mempool = mempool_create_kmalloc_pool(10,
+					sizeof(struct ceph_osd_request));
+	if (!osdc->req_mempool)
+		goto out;
+
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+				"osd_op");
+	if (err < 0)
+		goto out_mempool;
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+				OSD_OPREPLY_FRONT_LEN, 10, true,
+				"osd_op_reply");
+	if (err < 0)
+		goto out_msgpool;
+	return 0;
+
+out_msgpool:
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+	mempool_destroy(osdc->req_mempool);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_osdc_init);
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work_sync(&osdc->timeout_work);
+	cancel_delayed_work_sync(&osdc->osds_timeout_work);
+	if (osdc->osdmap) {
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = NULL;
+	}
+	remove_old_osds(osdc, 1);
+	mempool_destroy(osdc->req_mempool);
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+EXPORT_SYMBOL(ceph_osdc_stop);
+
+/*
+ * Read some contiguous pages.  If we cross a stripe boundary, shorten
+ * *plen.  Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			u32 truncate_seq, u64 truncate_size,
+			struct page **pages, int num_pages)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+	     vino.snap, off, *plen);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+				    NULL, 0, truncate_seq, truncate_size, NULL,
+				    false, 1);
+	if (!req)
+		return -ENOMEM;
+
+	/* it may be a short read due to an object boundary */
+	req->r_pages = pages;
+
+	dout("readpages  final extent is %llu~%llu (%d pages)\n",
+	     off, *plen, req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, false);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	dout("readpages result %d\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_readpages);
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+			 struct ceph_file_layout *layout,
+			 struct ceph_snap_context *snapc,
+			 u64 off, u64 len,
+			 u32 truncate_seq, u64 truncate_size,
+			 struct timespec *mtime,
+			 struct page **pages, int num_pages,
+			 int flags, int do_sync, bool nofail)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	BUG_ON(vino.snap != CEPH_NOSNAP);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+				    CEPH_OSD_OP_WRITE,
+				    flags | CEPH_OSD_FLAG_ONDISK |
+					    CEPH_OSD_FLAG_WRITE,
+				    snapc, do_sync,
+				    truncate_seq, truncate_size, mtime,
+				    nofail, 1);
+	if (!req)
+		return -ENOMEM;
+
+	/* it may be a short write due to an object boundary */
+	req->r_pages = pages;
+	dout("writepages %llu~%llu (%d pages)\n", off, len,
+	     req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, nofail);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	if (rc == 0)
+		rc = len;
+	dout("writepages result %d\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_writepages);
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!osd)
+		goto out;
+	osdc = osd->o_osdc;
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(osdc, msg);
+		break;
+	case CEPH_MSG_OSD_OPREPLY:
+		handle_reply(osdc, msg, con);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+out:
+	ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct ceph_msg *m;
+	struct ceph_osd_request *req;
+	int front = le32_to_cpu(hdr->front_len);
+	int data_len = le32_to_cpu(hdr->data_len);
+	u64 tid;
+
+	tid = le64_to_cpu(hdr->tid);
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (!req) {
+		*skip = 1;
+		m = NULL;
+		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+			osd->o_osd);
+		goto out;
+	}
+
+	if (req->r_con_filling_msg) {
+		dout("get_reply revoking msg %p from old con %p\n",
+		     req->r_reply, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+		req->r_con_filling_msg = NULL;
+	}
+
+	if (front > req->r_reply->front.iov_len) {
+		pr_warning("get_reply front %d > preallocated %d\n",
+			   front, (int)req->r_reply->front.iov_len);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+		if (!m)
+			goto out;
+		ceph_msg_put(req->r_reply);
+		req->r_reply = m;
+	}
+	m = ceph_msg_get(req->r_reply);
+
+	if (data_len > 0) {
+		unsigned data_off = le16_to_cpu(hdr->data_off);
+		int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+		if (unlikely(req->r_num_pages < want)) {
+			pr_warning("tid %lld reply %d > expected %d pages\n",
+				   tid, want, m->nr_pages);
+			*skip = 1;
+			ceph_msg_put(m);
+			m = NULL;
+			goto out;
+		}
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+		m->bio = req->r_bio;
+#endif
+	}
+	*skip = 0;
+	req->r_con_filling_msg = ceph_con_get(con);
+	dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+	mutex_unlock(&osdc->request_mutex);
+	return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		return ceph_msg_new(type, front, GFP_NOFS);
+	case CEPH_MSG_OSD_OPREPLY:
+		return get_reply(con, hdr, skip);
+	default:
+		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+			osd->o_osd);
+		*skip = 1;
+		return NULL;
+	}
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	if (get_osd(osd))
+		return con;
+	return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+			  void **buf, int *len, int *proto,
+			  void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+	int ret = 0;
+
+	if (force_new && o->o_authorizer) {
+		ac->ops->destroy_authorizer(ac, o->o_authorizer);
+		o->o_authorizer = NULL;
+	}
+	if (o->o_authorizer == NULL) {
+		ret = ac->ops->create_authorizer(
+			ac, CEPH_ENTITY_TYPE_OSD,
+			&o->o_authorizer,
+			&o->o_authorizer_buf,
+			&o->o_authorizer_buf_len,
+			&o->o_authorizer_reply_buf,
+			&o->o_authorizer_reply_buf_len);
+		if (ret)
+			return ret;
+	}
+
+	*proto = ac->protocol;
+	*buf = o->o_authorizer_buf;
+	*len = o->o_authorizer_buf_len;
+	*reply_buf = o->o_authorizer_reply_buf;
+	*reply_len = o->o_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+	return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+	.get = get_osd_con,
+	.put = put_osd_con,
+	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
+	.alloc_msg = alloc_msg,
+	.fault = osd_reset,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
new file mode 100644
index 00000000000..d73f3f6efa3
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+	int flag = 0;
+
+	if (!len)
+		goto done;
+
+	*str = '\0';
+	if (state) {
+		if (state & CEPH_OSD_EXISTS) {
+			snprintf(str, len, "exists");
+			flag = 1;
+		}
+		if (state & CEPH_OSD_UP) {
+			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+				 "up");
+			flag = 1;
+		}
+	} else {
+		snprintf(str, len, "doesn't exist");
+	}
+done:
+	return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+	int b = 0;
+	while (t) {
+		t = t >> 1;
+		b++;
+	}
+	return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+	pi->pgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+	pi->lpg_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+	pi->lpgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+				       struct crush_bucket_uniform *b)
+{
+	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+	b->item_weight = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+				    struct crush_bucket_list *b)
+{
+	int j;
+	dout("crush_decode_list_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->sum_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		b->item_weights[j] = ceph_decode_32(p);
+		b->sum_weights[j] = ceph_decode_32(p);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+				    struct crush_bucket_tree *b)
+{
+	int j;
+	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+	ceph_decode_32_safe(p, end, b->num_nodes, bad);
+	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+	if (b->node_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+	for (j = 0; j < b->num_nodes; j++)
+		b->node_weights[j] = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+				     struct crush_bucket_straw *b)
+{
+	int j;
+	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->straws == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		b->item_weights[j] = ceph_decode_32(p);
+		b->straws[j] = ceph_decode_32(p);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+	struct crush_map *c;
+	int err = -EINVAL;
+	int i, j;
+	void **p = &pbyval;
+	void *start = pbyval;
+	u32 magic;
+
+	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	c = kzalloc(sizeof(*c), GFP_NOFS);
+	if (c == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_need(p, end, 4*sizeof(u32), bad);
+	magic = ceph_decode_32(p);
+	if (magic != CRUSH_MAGIC) {
+		pr_err("crush_decode magic %x != current %x\n",
+		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
+		goto bad;
+	}
+	c->max_buckets = ceph_decode_32(p);
+	c->max_rules = ceph_decode_32(p);
+	c->max_devices = ceph_decode_32(p);
+
+	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+	if (c->device_parents == NULL)
+		goto badmem;
+	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+	if (c->bucket_parents == NULL)
+		goto badmem;
+
+	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+	if (c->buckets == NULL)
+		goto badmem;
+	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+	if (c->rules == NULL)
+		goto badmem;
+
+	/* buckets */
+	for (i = 0; i < c->max_buckets; i++) {
+		int size = 0;
+		u32 alg;
+		struct crush_bucket *b;
+
+		ceph_decode_32_safe(p, end, alg, bad);
+		if (alg == 0) {
+			c->buckets[i] = NULL;
+			continue;
+		}
+		dout("crush_decode bucket %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		switch (alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			size = sizeof(struct crush_bucket_uniform);
+			break;
+		case CRUSH_BUCKET_LIST:
+			size = sizeof(struct crush_bucket_list);
+			break;
+		case CRUSH_BUCKET_TREE:
+			size = sizeof(struct crush_bucket_tree);
+			break;
+		case CRUSH_BUCKET_STRAW:
+			size = sizeof(struct crush_bucket_straw);
+			break;
+		default:
+			err = -EINVAL;
+			goto bad;
+		}
+		BUG_ON(size == 0);
+		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+		if (b == NULL)
+			goto badmem;
+
+		ceph_decode_need(p, end, 4*sizeof(u32), bad);
+		b->id = ceph_decode_32(p);
+		b->type = ceph_decode_16(p);
+		b->alg = ceph_decode_8(p);
+		b->hash = ceph_decode_8(p);
+		b->weight = ceph_decode_32(p);
+		b->size = ceph_decode_32(p);
+
+		dout("crush_decode bucket size %d off %x %p to %p\n",
+		     b->size, (int)(*p-start), *p, end);
+
+		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+		if (b->items == NULL)
+			goto badmem;
+		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+		if (b->perm == NULL)
+			goto badmem;
+		b->perm_n = 0;
+
+		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+		for (j = 0; j < b->size; j++)
+			b->items[j] = ceph_decode_32(p);
+
+		switch (b->alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			err = crush_decode_uniform_bucket(p, end,
+				  (struct crush_bucket_uniform *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_LIST:
+			err = crush_decode_list_bucket(p, end,
+			       (struct crush_bucket_list *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_TREE:
+			err = crush_decode_tree_bucket(p, end,
+				(struct crush_bucket_tree *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_STRAW:
+			err = crush_decode_straw_bucket(p, end,
+				(struct crush_bucket_straw *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		}
+	}
+
+	/* rules */
+	dout("rule vec is %p\n", c->rules);
+	for (i = 0; i < c->max_rules; i++) {
+		u32 yes;
+		struct crush_rule *r;
+
+		ceph_decode_32_safe(p, end, yes, bad);
+		if (!yes) {
+			dout("crush_decode NO rule %d off %x %p to %p\n",
+			     i, (int)(*p-start), *p, end);
+			c->rules[i] = NULL;
+			continue;
+		}
+
+		dout("crush_decode rule %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		/* len */
+		ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+		err = -EINVAL;
+		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+			goto bad;
+#endif
+		r = c->rules[i] = kmalloc(sizeof(*r) +
+					  yes*sizeof(struct crush_rule_step),
+					  GFP_NOFS);
+		if (r == NULL)
+			goto badmem;
+		dout(" rule %d is at %p\n", i, r);
+		r->len = yes;
+		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+		for (j = 0; j < r->len; j++) {
+			r->steps[j].op = ceph_decode_32(p);
+			r->steps[j].arg1 = ceph_decode_32(p);
+			r->steps[j].arg2 = ceph_decode_32(p);
+		}
+	}
+
+	/* ignore trailing name maps. */
+
+	dout("crush_decode success\n");
+	return c;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	dout("crush_decode fail %d\n", err);
+	crush_destroy(c);
+	return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+	u64 a = *(u64 *)&l;
+	u64 b = *(u64 *)&r;
+
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+			       struct rb_root *root)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_mapping *pg = NULL;
+	int c;
+
+	while (*p) {
+		parent = *p;
+		pg = rb_entry(parent, struct ceph_pg_mapping, node);
+		c = pgid_cmp(new->pgid, pg->pgid);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+						   struct ceph_pg pgid)
+{
+	struct rb_node *n = root->rb_node;
+	struct ceph_pg_mapping *pg;
+	int c;
+
+	while (n) {
+		pg = rb_entry(n, struct ceph_pg_mapping, node);
+		c = pgid_cmp(pgid, pg->pgid);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return pg;
+	}
+	return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_pool_info *pi = NULL;
+
+	while (*p) {
+		parent = *p;
+		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+		if (new->id < pi->id)
+			p = &(*p)->rb_left;
+		else if (new->id > pi->id)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+	struct ceph_pg_pool_info *pi;
+	struct rb_node *n = root->rb_node;
+
+	while (n) {
+		pi = rb_entry(n, struct ceph_pg_pool_info, node);
+		if (id < pi->id)
+			n = n->rb_left;
+		else if (id > pi->id)
+			n = n->rb_right;
+		else
+			return pi;
+	}
+	return NULL;
+}
+
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+	struct rb_node *rbp;
+
+	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rbp, struct ceph_pg_pool_info, node);
+		if (pi->name && strcmp(pi->name, name) == 0)
+			return pi->id;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+	rb_erase(&pi->node, root);
+	kfree(pi->name);
+	kfree(pi);
+}
+
+static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+	unsigned n, m;
+
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+
+	/* num_snaps * snap_info_t */
+	n = le32_to_cpu(pi->v.num_snaps);
+	while (n--) {
+		ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
+				 sizeof(struct ceph_timespec), bad);
+		*p += sizeof(u64) +       /* key */
+			1 + sizeof(u64) + /* u8, snapid */
+			sizeof(struct ceph_timespec);
+		m = ceph_decode_32(p);    /* snap name */
+		*p += m;
+	}
+
+	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+	struct ceph_pg_pool_info *pi;
+	u32 num, len, pool;
+
+	ceph_decode_32_safe(p, end, num, bad);
+	dout(" %d pool names\n", num);
+	while (num--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_32_safe(p, end, len, bad);
+		dout("  pool %d len %d\n", pool, len);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			kfree(pi->name);
+			pi->name = kmalloc(len + 1, GFP_NOFS);
+			if (pi->name) {
+				memcpy(pi->name, *p, len);
+				pi->name[len] = '\0';
+				dout("  name is %s\n", pi->name);
+			}
+		}
+		*p += len;
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_temp),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_temp);
+		kfree(pg);
+	}
+	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rb_first(&map->pg_pools),
+				 struct ceph_pg_pool_info, node);
+		__remove_pg_pool(&map->pg_pools, pi);
+	}
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+	struct ceph_osdmap *map;
+	u16 version;
+	u32 len, max, i;
+	u8 ev;
+	int err = -EINVAL;
+	void *start = *p;
+	struct ceph_pg_pool_info *pi;
+
+	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (map == NULL)
+		return ERR_PTR(-ENOMEM);
+	map->pg_temp = RB_ROOT;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_VERSION) {
+		pr_warning("got unknown v %d > %d of osdmap\n", version,
+			   CEPH_OSDMAP_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+	map->epoch = ceph_decode_32(p);
+	ceph_decode_copy(p, &map->created, sizeof(map->created));
+	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+	ceph_decode_32_safe(p, end, max, bad);
+	while (max--) {
+		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		pi = kzalloc(sizeof(*pi), GFP_NOFS);
+		if (!pi)
+			goto bad;
+		pi->id = ceph_decode_32(p);
+		ev = ceph_decode_8(p); /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			kfree(pi);
+			goto bad;
+		}
+		err = __decode_pool(p, end, pi);
+		if (err < 0)
+			goto bad;
+		__insert_pg_pool(&map->pg_pools, pi);
+	}
+
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+	ceph_decode_32_safe(p, end, map->flags, bad);
+
+	max = ceph_decode_32(p);
+
+	/* (re)alloc osd arrays */
+	err = osdmap_set_max_osd(map, max);
+	if (err < 0)
+		goto bad;
+	dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+	/* osds */
+	err = -EINVAL;
+	ceph_decode_need(p, end, 3*sizeof(u32) +
+			 map->max_osd*(1 + sizeof(*map->osd_weight) +
+				       sizeof(*map->osd_addr)), bad);
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+	*p += 4; /* skip length field (should match max) */
+	for (i = 0; i < map->max_osd; i++)
+		map->osd_weight[i] = ceph_decode_32(p);
+
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+	for (i = 0; i < map->max_osd; i++)
+		ceph_decode_addr(&map->osd_addr[i]);
+
+	/* pg_temp */
+	ceph_decode_32_safe(p, end, len, bad);
+	for (i = 0; i < len; i++) {
+		int n, j;
+		struct ceph_pg pgid;
+		struct ceph_pg_mapping *pg;
+
+		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		n = ceph_decode_32(p);
+		ceph_decode_need(p, end, n * sizeof(u32), bad);
+		err = -ENOMEM;
+		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+		if (!pg)
+			goto bad;
+		pg->pgid = pgid;
+		pg->len = n;
+		for (j = 0; j < n; j++)
+			pg->osds[j] = ceph_decode_32(p);
+
+		err = __insert_pg_mapping(pg, &map->pg_temp);
+		if (err)
+			goto bad;
+		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+	}
+
+	/* crush */
+	ceph_decode_32_safe(p, end, len, bad);
+	dout("osdmap_decode crush len %d from off 0x%x\n", len,
+	     (int)(*p - start));
+	ceph_decode_need(p, end, len, bad);
+	map->crush = crush_decode(*p, end);
+	*p += len;
+	if (IS_ERR(map->crush)) {
+		err = PTR_ERR(map->crush);
+		map->crush = NULL;
+		goto bad;
+	}
+
+	/* ignore the rest of the map */
+	*p = end;
+
+	dout("osdmap_decode done %p %p\n", *p, end);
+	return map;
+
+bad:
+	dout("osdmap_decode fail\n");
+	ceph_osdmap_destroy(map);
+	return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					     struct ceph_osdmap *map,
+					     struct ceph_messenger *msgr)
+{
+	struct crush_map *newcrush = NULL;
+	struct ceph_fsid fsid;
+	u32 epoch = 0;
+	struct ceph_timespec modified;
+	u32 len, pool;
+	__s32 new_pool_max, new_flags, max;
+	void *start = *p;
+	int err = -EINVAL;
+	u16 version;
+	struct rb_node *rbp;
+
+	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_INC_VERSION) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+			   CEPH_OSDMAP_INC_VERSION);
+		goto bad;
+	}
+
+	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+			 bad);
+	ceph_decode_copy(p, &fsid, sizeof(fsid));
+	epoch = ceph_decode_32(p);
+	BUG_ON(epoch != map->epoch+1);
+	ceph_decode_copy(p, &modified, sizeof(modified));
+	new_pool_max = ceph_decode_32(p);
+	new_flags = ceph_decode_32(p);
+
+	/* full map? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental full map len %d, %p to %p\n",
+		     len, *p, end);
+		return osdmap_decode(p, min(*p+len, end));
+	}
+
+	/* new crush? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental new crush map len %d, %p to %p\n",
+		     len, *p, end);
+		newcrush = crush_decode(*p, min(*p+len, end));
+		if (IS_ERR(newcrush))
+			return ERR_CAST(newcrush);
+		*p += len;
+	}
+
+	/* new flags? */
+	if (new_flags >= 0)
+		map->flags = new_flags;
+	if (new_pool_max >= 0)
+		map->pool_max = new_pool_max;
+
+	ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+	/* new max? */
+	max = ceph_decode_32(p);
+	if (max >= 0) {
+		err = osdmap_set_max_osd(map, max);
+		if (err < 0)
+			goto bad;
+	}
+
+	map->epoch++;
+	map->modified = map->modified;
+	if (newcrush) {
+		if (map->crush)
+			crush_destroy(map->crush);
+		map->crush = newcrush;
+		newcrush = NULL;
+	}
+
+	/* new_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		__u8 ev;
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+		ev = ceph_decode_8(p);  /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			goto bad;
+		}
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (!pi) {
+			pi = kzalloc(sizeof(*pi), GFP_NOFS);
+			if (!pi) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pi->id = pool;
+			__insert_pg_pool(&map->pg_pools, pi);
+		}
+		err = __decode_pool(p, end, pi);
+		if (err < 0)
+			goto bad;
+	}
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
+	/* old_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi)
+			__remove_pg_pool(&map->pg_pools, pi);
+	}
+
+	/* new_up */
+	err = -EINVAL;
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		struct ceph_entity_addr addr;
+		ceph_decode_32_safe(p, end, osd, bad);
+		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+		ceph_decode_addr(&addr);
+		pr_info("osd%d up\n", osd);
+		BUG_ON(osd >= map->max_osd);
+		map->osd_state[osd] |= CEPH_OSD_UP;
+		map->osd_addr[osd] = addr;
+	}
+
+	/* new_down */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		ceph_decode_32_safe(p, end, osd, bad);
+		(*p)++;  /* clean flag */
+		pr_info("osd%d down\n", osd);
+		if (osd < map->max_osd)
+			map->osd_state[osd] &= ~CEPH_OSD_UP;
+	}
+
+	/* new_weight */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd, off;
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		osd = ceph_decode_32(p);
+		off = ceph_decode_32(p);
+		pr_info("osd%d weight 0x%x %s\n", osd, off,
+		     off == CEPH_OSD_IN ? "(in)" :
+		     (off == CEPH_OSD_OUT ? "(out)" : ""));
+		if (osd < map->max_osd)
+			map->osd_weight[osd] = off;
+	}
+
+	/* new_pg_temp */
+	rbp = rb_first(&map->pg_temp);
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_mapping *pg;
+		int j;
+		struct ceph_pg pgid;
+		u32 pglen;
+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
+		pglen = ceph_decode_32(p);
+
+		/* remove any? */
+		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+						node)->pgid, pgid) <= 0) {
+			struct ceph_pg_mapping *cur =
+				rb_entry(rbp, struct ceph_pg_mapping, node);
+
+			rbp = rb_next(rbp);
+			dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+			rb_erase(&cur->node, &map->pg_temp);
+			kfree(cur);
+		}
+
+		if (pglen) {
+			/* insert */
+			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+			if (!pg) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pg->pgid = pgid;
+			pg->len = pglen;
+			for (j = 0; j < pglen; j++)
+				pg->osds[j] = ceph_decode_32(p);
+			err = __insert_pg_mapping(pg, &map->pg_temp);
+			if (err) {
+				kfree(pg);
+				goto bad;
+			}
+			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+			     pglen);
+		}
+	}
+	while (rbp) {
+		struct ceph_pg_mapping *cur =
+			rb_entry(rbp, struct ceph_pg_mapping, node);
+
+		rbp = rb_next(rbp);
+		dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+		rb_erase(&cur->node, &map->pg_temp);
+		kfree(cur);
+	}
+
+	/* ignore the rest */
+	*p = end;
+	return map;
+
+bad:
+	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+	       epoch, (int)(*p - start), *p, start, end);
+	print_hex_dump(KERN_DEBUG, "osdmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
+	if (newcrush)
+		crush_destroy(newcrush);
+	return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+				   u64 off, u64 *plen,
+				   u64 *ono,
+				   u64 *oxoff, u64 *oxlen)
+{
+	u32 osize = le32_to_cpu(layout->fl_object_size);
+	u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	u32 bl, stripeno, stripepos, objsetno;
+	u32 su_per_object;
+	u64 t, su_offset;
+
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	     osize, su);
+	su_per_object = osize / su;
+	dout("osize %u / su %u = su_per_object %u\n", osize, su,
+	     su_per_object);
+
+	BUG_ON((su & ~PAGE_MASK) != 0);
+	/* bl = *off / su; */
+	t = off;
+	do_div(t, su);
+	bl = t;
+	dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+	stripeno = bl / sc;
+	stripepos = bl % sc;
+	objsetno = stripeno / su_per_object;
+
+	*ono = objsetno * sc + stripepos;
+	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
+	t = off;
+	su_offset = do_div(t, su);
+	*oxoff = su_offset + (stripeno % su_per_object) * su;
+
+	/*
+	 * Calculate the length of the extent being written to the selected
+	 * object. This is the minimum of the full length requested (plen) or
+	 * the remainder of the current stripe being written to.
+	 */
+	*oxlen = min_t(u64, *plen, su - su_offset);
+	*plen = *oxlen;
+
+	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+			    const char *oid,
+			    struct ceph_file_layout *fl,
+			    struct ceph_osdmap *osdmap)
+{
+	unsigned num, num_mask;
+	struct ceph_pg pgid;
+	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg_pool_info *pool;
+	unsigned ps;
+
+	BUG_ON(!osdmap);
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return -EIO;
+	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+	if (preferred >= 0) {
+		ps += preferred;
+		num = le32_to_cpu(pool->v.lpg_num);
+		num_mask = pool->lpg_num_mask;
+	} else {
+		num = le32_to_cpu(pool->v.pg_num);
+		num_mask = pool->pg_num_mask;
+	}
+
+	pgid.ps = cpu_to_le16(ps);
+	pgid.preferred = cpu_to_le16(preferred);
+	pgid.pool = fl->fl_pg_pool;
+	if (preferred >= 0)
+		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+		     (int)preferred);
+	else
+		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+	ol->ol_pgid = pgid;
+	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_calc_object_layout);
+
+/*
+ * Calculate raw osd vector for the given pgid.  Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *osds, int *num)
+{
+	struct ceph_pg_mapping *pg;
+	struct ceph_pg_pool_info *pool;
+	int ruleno;
+	unsigned poolid, ps, pps;
+	int preferred;
+
+	/* pg_temp? */
+	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	if (pg) {
+		*num = pg->len;
+		return pg->osds;
+	}
+
+	/* crush */
+	poolid = le32_to_cpu(pgid.pool);
+	ps = le16_to_cpu(pgid.ps);
+	preferred = (s16)le16_to_cpu(pgid.preferred);
+
+	/* don't forcefeed bad device ids to crush */
+	if (preferred >= osdmap->max_osd ||
+	    preferred >= osdmap->crush->max_devices)
+		preferred = -1;
+
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return NULL;
+	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+				 pool->v.type, pool->v.size);
+	if (ruleno < 0) {
+		pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
+		       poolid, pool->v.crush_ruleset, pool->v.type,
+		       pool->v.size);
+		return NULL;
+	}
+
+	if (preferred >= 0)
+		pps = ceph_stable_mod(ps,
+				      le32_to_cpu(pool->v.lpgp_num),
+				      pool->lpgp_num_mask);
+	else
+		pps = ceph_stable_mod(ps,
+				      le32_to_cpu(pool->v.pgp_num),
+				      pool->pgp_num_mask);
+	pps += poolid;
+	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+			     min_t(int, pool->v.size, *num),
+			     preferred, osdmap->osd_weight);
+	return osds;
+}
+
+/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *acting)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, o, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	o = 0;
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			acting[o++] = osds[i];
+	return o;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			return osds[i];
+	return -1;
+}
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 00000000000..13cb409a7bb
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+	if (pl->mapped_tail) {
+		struct page *page = list_entry(pl->head.prev, struct page, lru);
+		kunmap(page);
+		pl->mapped_tail = NULL;
+	}
+}
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+	ceph_pagelist_unmap_tail(pl);
+	while (!list_empty(&pl->head)) {
+		struct page *page = list_first_entry(&pl->head, struct page,
+						     lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	ceph_pagelist_free_reserve(pl);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+	struct page *page;
+
+	if (!pl->num_pages_free) {
+		page = __page_cache_alloc(GFP_NOFS);
+	} else {
+		page = list_first_entry(&pl->free_list, struct page, lru);
+		list_del(&page->lru);
+		--pl->num_pages_free;
+	}
+	if (!page)
+		return -ENOMEM;
+	pl->room += PAGE_SIZE;
+	ceph_pagelist_unmap_tail(pl);
+	list_add_tail(&page->lru, &pl->head);
+	pl->mapped_tail = kmap(page);
+	return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+	while (pl->room < len) {
+		size_t bit = pl->room;
+		int ret;
+
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		       buf, bit);
+		pl->length += bit;
+		pl->room -= bit;
+		buf += bit;
+		len -= bit;
+		ret = ceph_pagelist_addpage(pl);
+		if (ret)
+			return ret;
+	}
+
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	pl->length += len;
+	pl->room -= len;
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/**
+ * Allocate enough pages for a pagelist to append the given amount
+ * of data without without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+	if (space <= pl->room)
+		return 0;
+	space -= pl->room;
+	space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT;   /* conv to num pages */
+
+	while (space > pl->num_pages_free) {
+		struct page *page = __page_cache_alloc(GFP_NOFS);
+		if (!page)
+			return -ENOMEM;
+		list_add_tail(&page->lru, &pl->free_list);
+		++pl->num_pages_free;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/**
+ * Free any pages that have been preallocated.
+ */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+	while (!list_empty(&pl->free_list)) {
+		struct page *page = list_first_entry(&pl->free_list,
+						     struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+		--pl->num_pages_free;
+	}
+	BUG_ON(pl->num_pages_free);
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/**
+ * Create a truncation point.
+ */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+			      struct ceph_pagelist_cursor *c)
+{
+	c->pl = pl;
+	c->page_lru = pl->head.prev;
+	c->room = pl->room;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/**
+ * Truncate a pagelist to the given point. Move extra pages to reserve.
+ * This won't sleep.
+ * Returns: 0 on success,
+ *          -EINVAL if the pagelist doesn't match the trunc point pagelist
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+			   struct ceph_pagelist_cursor *c)
+{
+	struct page *page;
+
+	if (pl != c->pl)
+		return -EINVAL;
+	ceph_pagelist_unmap_tail(pl);
+	while (pl->head.prev != c->page_lru) {
+		page = list_entry(pl->head.prev, struct page, lru);
+		list_del(&page->lru);                /* remove from pagelist */
+		list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+		++pl->num_pages_free;
+	}
+	pl->room = c->room;
+	if (!list_empty(&pl->head)) {
+		page = list_entry(pl->head.prev, struct page, lru);
+		pl->mapped_tail = kmap(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 00000000000..54caf068715
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+						 int num_pages,
+						 loff_t off, size_t len)
+{
+	struct page **pages;
+	int rc;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	down_read(&current->mm->mmap_sem);
+	rc = get_user_pages(current, current->mm, (unsigned long)data,
+			    num_pages, 0, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		goto fail;
+	return pages;
+
+fail:
+	kfree(pages);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		__free_pages(pages[i], 0);
+	kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+	struct page **pages;
+	int i;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, flags);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = __page_cache_alloc(flags);
+		if (pages[i] == NULL) {
+			ceph_release_page_vector(pages, i);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+					 const char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		po += l - bad;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+int ceph_copy_to_page_vector(struct page **pages,
+				    const char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		memcpy(page_address(pages[i]) + po, data, l);
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+int ceph_copy_from_page_vector(struct page **pages,
+				    char *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t left = len;
+	size_t l;
+
+	while (left > 0) {
+		l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		memcpy(data, page_address(pages[i]) + po, l);
+		data += l;
+		left -= l;
+		po += l;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
+		}
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+int ceph_copy_page_vector_to_user(struct page **pages,
+					 char __user *data,
+					 loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		if (po) {
+			po += l - bad;
+			if (po == PAGE_CACHE_SIZE)
+				po = 0;
+		}
+		i++;
+	}
+	return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+	int i = off >> PAGE_CACHE_SHIFT;
+
+	off &= ~PAGE_CACHE_MASK;
+
+	dout("zero_page_vector_page %u~%u\n", off, len);
+
+	/* leading partial page? */
+	if (off) {
+		int end = min((int)PAGE_CACHE_SIZE, off + len);
+		dout("zeroing %d %p head from %d\n", i, pages[i],
+		     (int)off);
+		zero_user_segment(pages[i], off, end);
+		len -= (end - off);
+		i++;
+	}
+	while (len >= PAGE_CACHE_SIZE) {
+		dout("zeroing %d %p len=%d\n", i, pages[i], len);
+		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+		i++;
+	}
+	/* trailing partial page? */
+	if (len) {
+		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+		zero_user_segment(pages[i], 0, len);
+	}
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/compat.c b/net/compat.c
index ec24d9edb02..3649d589536 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -41,10 +41,12 @@ static inline int iov_from_user_compat_to_kern(struct iovec *kiov,
 		compat_size_t len;
 
 		if (get_user(len, &uiov32->iov_len) ||
-		   get_user(buf, &uiov32->iov_base)) {
-			tot_len = -EFAULT;
-			break;
-		}
+		    get_user(buf, &uiov32->iov_base))
+			return -EFAULT;
+
+		if (len > INT_MAX - tot_len)
+			len = INT_MAX - tot_len;
+
 		tot_len += len;
 		kiov->iov_base = compat_ptr(buf);
 		kiov->iov_len = (__kernel_size_t) len;
@@ -81,7 +83,7 @@ int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov,
 	int tot_len;
 
 	if (kern_msg->msg_namelen) {
-		if (mode==VERIFY_READ) {
+		if (mode == VERIFY_READ) {
 			int err = move_addr_to_kernel(kern_msg->msg_name,
 						      kern_msg->msg_namelen,
 						      kern_address);
@@ -354,7 +356,7 @@ static int do_set_attach_filter(struct socket *sock, int level, int optname,
 static int do_set_sock_timeout(struct socket *sock, int level,
 		int optname, char __user *optval, unsigned int optlen)
 {
-	struct compat_timeval __user *up = (struct compat_timeval __user *) optval;
+	struct compat_timeval __user *up = (struct compat_timeval __user *)optval;
 	struct timeval ktime;
 	mm_segment_t old_fs;
 	int err;
@@ -367,7 +369,7 @@ static int do_set_sock_timeout(struct socket *sock, int level,
 		return -EFAULT;
 	old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	err = sock_setsockopt(sock, level, optname, (char *) &ktime, sizeof(ktime));
+	err = sock_setsockopt(sock, level, optname, (char *)&ktime, sizeof(ktime));
 	set_fs(old_fs);
 
 	return err;
@@ -389,11 +391,10 @@ asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
 				char __user *optval, unsigned int optlen)
 {
 	int err;
-	struct socket *sock;
+	struct socket *sock = sockfd_lookup(fd, &err);
 
-	if ((sock = sockfd_lookup(fd, &err))!=NULL)
-	{
-		err = security_socket_setsockopt(sock,level,optname);
+	if (sock) {
+		err = security_socket_setsockopt(sock, level, optname);
 		if (err) {
 			sockfd_put(sock);
 			return err;
@@ -453,7 +454,7 @@ static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
 int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
 {
 	struct compat_timeval __user *ctv =
-			(struct compat_timeval __user*) userstamp;
+			(struct compat_timeval __user *) userstamp;
 	int err = -ENOENT;
 	struct timeval tv;
 
@@ -477,7 +478,7 @@ EXPORT_SYMBOL(compat_sock_get_timestamp);
 int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
 {
 	struct compat_timespec __user *ctv =
-			(struct compat_timespec __user*) userstamp;
+			(struct compat_timespec __user *) userstamp;
 	int err = -ENOENT;
 	struct timespec ts;
 
@@ -502,12 +503,10 @@ asmlinkage long compat_sys_getsockopt(int fd, int level, int optname,
 				char __user *optval, int __user *optlen)
 {
 	int err;
-	struct socket *sock;
+	struct socket *sock = sockfd_lookup(fd, &err);
 
-	if ((sock = sockfd_lookup(fd, &err))!=NULL)
-	{
-		err = security_socket_getsockopt(sock, level,
-							   optname);
+	if (sock) {
+		err = security_socket_getsockopt(sock, level, optname);
 		if (err) {
 			sockfd_put(sock);
 			return err;
@@ -531,7 +530,7 @@ struct compat_group_req {
 	__u32				 gr_interface;
 	struct __kernel_sockaddr_storage gr_group
 		__attribute__ ((aligned(4)));
-} __attribute__ ((packed));
+} __packed;
 
 struct compat_group_source_req {
 	__u32				 gsr_interface;
@@ -539,7 +538,7 @@ struct compat_group_source_req {
 		__attribute__ ((aligned(4)));
 	struct __kernel_sockaddr_storage gsr_source
 		__attribute__ ((aligned(4)));
-} __attribute__ ((packed));
+} __packed;
 
 struct compat_group_filter {
 	__u32				 gf_interface;
@@ -549,7 +548,7 @@ struct compat_group_filter {
 	__u32				 gf_numsrc;
 	struct __kernel_sockaddr_storage gf_slist[1]
 		__attribute__ ((aligned(4)));
-} __attribute__ ((packed));
+} __packed;
 
 #define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \
 			sizeof(struct __kernel_sockaddr_storage))
@@ -557,7 +556,7 @@ struct compat_group_filter {
 
 int compat_mc_setsockopt(struct sock *sock, int level, int optname,
 	char __user *optval, unsigned int optlen,
-	int (*setsockopt)(struct sock *,int,int,char __user *,unsigned int))
+	int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int))
 {
 	char __user	*koptval = optval;
 	int		koptlen = optlen;
@@ -640,12 +639,11 @@ int compat_mc_setsockopt(struct sock *sock, int level, int optname,
 	}
 	return setsockopt(sock, level, optname, koptval, koptlen);
 }
-
 EXPORT_SYMBOL(compat_mc_setsockopt);
 
 int compat_mc_getsockopt(struct sock *sock, int level, int optname,
 	char __user *optval, int __user *optlen,
-	int (*getsockopt)(struct sock *,int,int,char __user *,int __user *))
+	int (*getsockopt)(struct sock *, int, int, char __user *, int __user *))
 {
 	struct compat_group_filter __user *gf32 = (void *)optval;
 	struct group_filter __user *kgf;
@@ -681,7 +679,7 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname,
 	    __put_user(interface, &kgf->gf_interface) ||
 	    __put_user(fmode, &kgf->gf_fmode) ||
 	    __put_user(numsrc, &kgf->gf_numsrc) ||
-	    copy_in_user(&kgf->gf_group,&gf32->gf_group,sizeof(kgf->gf_group)))
+	    copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group)))
 		return -EFAULT;
 
 	err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen);
@@ -714,21 +712,22 @@ int compat_mc_getsockopt(struct sock *sock, int level, int optname,
 		copylen = numsrc * sizeof(gf32->gf_slist[0]);
 		if (copylen > klen)
 			copylen = klen;
-	        if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen))
+		if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen))
 			return -EFAULT;
 	}
 	return err;
 }
-
 EXPORT_SYMBOL(compat_mc_getsockopt);
 
 
 /* Argument list sizes for compat_sys_socketcall */
 #define AL(x) ((x) * sizeof(u32))
-static unsigned char nas[20]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
-				AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
-				AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-				AL(4),AL(5)};
+static unsigned char nas[20] = {
+	AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+	AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+	AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+	AL(4), AL(5)
+};
 #undef AL
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
@@ -827,7 +826,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 					  compat_ptr(a[4]), compat_ptr(a[5]));
 		break;
 	case SYS_SHUTDOWN:
-		ret = sys_shutdown(a0,a1);
+		ret = sys_shutdown(a0, a1);
 		break;
 	case SYS_SETSOCKOPT:
 		ret = compat_sys_setsockopt(a0, a1, a[2],
diff --git a/net/core/Makefile b/net/core/Makefile
index 51c3eec850e..8a04dd22cf7 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -18,4 +18,4 @@ obj-$(CONFIG_NET_DMA) += user_dma.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
 obj-$(CONFIG_TRACEPOINTS) += net-traces.o
 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
-
+obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f5b6f43a4c2..cd1e039c875 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -219,6 +219,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
 	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 				   &peeked, err);
 }
+EXPORT_SYMBOL(skb_recv_datagram);
 
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 {
@@ -242,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
 	unlock_sock_fast(sk, slow);
 
 	/* skb is now orphaned, can be freed outside of locked section */
+	trace_kfree_skb(skb, skb_free_datagram_locked);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(skb_free_datagram_locked);
@@ -288,7 +290,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 
 	return err;
 }
-
 EXPORT_SYMBOL(skb_kill_datagram);
 
 /**
@@ -373,6 +374,7 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
 fault:
 	return -EFAULT;
 }
+EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
 /**
  *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
@@ -716,6 +718,7 @@ csum_error:
 fault:
 	return -EFAULT;
 }
+EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
 
 /**
  * 	datagram_poll - generic datagram poll
@@ -744,13 +747,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
@@ -770,8 +772,4 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 
 	return mask;
 }
-
 EXPORT_SYMBOL(datagram_poll);
-EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
-EXPORT_SYMBOL(skb_copy_datagram_iovec);
-EXPORT_SYMBOL(skb_recv_datagram);
diff --git a/net/core/dev.c b/net/core/dev.c
index 1f466e82ac3..35dfb831848 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -101,8 +101,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/stat.h>
-#include <linux/if_bridge.h>
-#include <linux/if_macvlan.h>
 #include <net/dst.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
@@ -130,7 +128,10 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
 #include <linux/pci.h>
+#include <linux/inetdevice.h>
 
 #include "net-sysfs.h"
 
@@ -373,6 +374,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  *							--ANK (980803)
  */
 
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+	if (pt->type == htons(ETH_P_ALL))
+		return &ptype_all;
+	else
+		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
 /**
  *	dev_add_pack - add packet handler
  *	@pt: packet type declaration
@@ -388,16 +397,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 void dev_add_pack(struct packet_type *pt)
 {
-	int hash;
+	struct list_head *head = ptype_head(pt);
 
-	spin_lock_bh(&ptype_lock);
-	if (pt->type == htons(ETH_P_ALL))
-		list_add_rcu(&pt->list, &ptype_all);
-	else {
-		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
-		list_add_rcu(&pt->list, &ptype_base[hash]);
-	}
-	spin_unlock_bh(&ptype_lock);
+	spin_lock(&ptype_lock);
+	list_add_rcu(&pt->list, head);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
 
@@ -416,15 +420,10 @@ EXPORT_SYMBOL(dev_add_pack);
  */
 void __dev_remove_pack(struct packet_type *pt)
 {
-	struct list_head *head;
+	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
 
-	spin_lock_bh(&ptype_lock);
-
-	if (pt->type == htons(ETH_P_ALL))
-		head = &ptype_all;
-	else
-		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
 		if (pt == pt1) {
@@ -435,7 +434,7 @@ void __dev_remove_pack(struct packet_type *pt)
 
 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 out:
-	spin_unlock_bh(&ptype_lock);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(__dev_remove_pack);
 
@@ -803,35 +802,31 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
 /**
- *	dev_get_by_flags - find any device with given flags
+ *	dev_get_by_flags_rcu - find any device with given flags
  *	@net: the applicable net namespace
  *	@if_flags: IFF_* values
  *	@mask: bitmask of bits in if_flags to check
  *
  *	Search for any interface with the given flags. Returns NULL if a device
- *	is not found or a pointer to the device. The device returned has
- *	had a reference added and the pointer is safe until the user calls
- *	dev_put to indicate they have finished with it.
+ *	is not found or a pointer to the device. Must be called inside
+ *	rcu_read_lock(), and result refcount is unchanged.
  */
 
-struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 				    unsigned short mask)
 {
 	struct net_device *dev, *ret;
 
 	ret = NULL;
-	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
 		if (((dev->flags ^ if_flags) & mask) == 0) {
-			dev_hold(dev);
 			ret = dev;
 			break;
 		}
 	}
-	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL(dev_get_by_flags);
+EXPORT_SYMBOL(dev_get_by_flags_rcu);
 
 /**
  *	dev_valid_name - check if name is okay for network device
@@ -1490,8 +1485,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	skb_orphan(skb);
 	nf_reset(skb);
 
-	if (!(dev->flags & IFF_UP) ||
-	    (skb->len > (dev->mtu + dev->hard_header_len))) {
+	if (unlikely(!(dev->flags & IFF_UP) ||
+		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
+		atomic_long_inc(&dev->rx_dropped);
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
@@ -1542,7 +1538,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 				if (net_ratelimit())
 					printk(KERN_CRIT "protocol %04x is "
 					       "buggy, dev %s\n",
-					       skb2->protocol, dev->name);
+					       ntohs(skb2->protocol),
+					       dev->name);
 				skb_reset_network_header(skb2);
 			}
 
@@ -1558,21 +1555,56 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
  */
-void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 {
-	unsigned int real_num = dev->real_num_tx_queues;
+	if (txq < 1 || txq > dev->num_tx_queues)
+		return -EINVAL;
+
+	if (dev->reg_state == NETREG_REGISTERED) {
+		ASSERT_RTNL();
 
-	if (unlikely(txq > dev->num_tx_queues))
-		;
-	else if (txq > real_num)
-		dev->real_num_tx_queues = txq;
-	else if (txq < real_num) {
-		dev->real_num_tx_queues = txq;
-		qdisc_reset_all_tx_gt(dev, txq);
+		if (txq < dev->real_num_tx_queues)
+			qdisc_reset_all_tx_gt(dev, txq);
 	}
+
+	dev->real_num_tx_queues = txq;
+	return 0;
 }
 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
 
+#ifdef CONFIG_RPS
+/**
+ *	netif_set_real_num_rx_queues - set actual number of RX queues used
+ *	@dev: Network device
+ *	@rxq: Actual number of RX queues
+ *
+ *	This must be called either with the rtnl_lock held or before
+ *	registration of the net device.  Returns 0 on success, or a
+ *	negative error code.  If called before registration, it always
+ *	succeeds.
+ */
+int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
+{
+	int rc;
+
+	if (rxq < 1 || rxq > dev->num_rx_queues)
+		return -EINVAL;
+
+	if (dev->reg_state == NETREG_REGISTERED) {
+		ASSERT_RTNL();
+
+		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
+						  rxq);
+		if (rc)
+			return rc;
+	}
+
+	dev->real_num_rx_queues = rxq;
+	return 0;
+}
+EXPORT_SYMBOL(netif_set_real_num_rx_queues);
+#endif
+
 static inline void __netif_reschedule(struct Qdisc *q)
 {
 	struct softnet_data *sd;
@@ -1653,10 +1685,10 @@ EXPORT_SYMBOL(netif_device_attach);
 
 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
 {
-	return ((features & NETIF_F_GEN_CSUM) ||
-		((features & NETIF_F_IP_CSUM) &&
+	return ((features & NETIF_F_NO_CSUM) ||
+		((features & NETIF_F_V4_CSUM) &&
 		 protocol == htons(ETH_P_IP)) ||
-		((features & NETIF_F_IPV6_CSUM) &&
+		((features & NETIF_F_V6_CSUM) &&
 		 protocol == htons(ETH_P_IPV6)) ||
 		((features & NETIF_F_FCOE_CRC) &&
 		 protocol == htons(ETH_P_FCOE)));
@@ -1664,17 +1696,18 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
 
 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
 {
-	if (can_checksum_protocol(dev->features, skb->protocol))
-		return true;
+	__be16 protocol = skb->protocol;
+	int features = dev->features;
 
-	if (skb->protocol == htons(ETH_P_8021Q)) {
+	if (vlan_tx_tag_present(skb)) {
+		features &= dev->vlan_features;
+	} else if (protocol == htons(ETH_P_8021Q)) {
 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
-		if (can_checksum_protocol(dev->features & dev->vlan_features,
-					  veh->h_vlan_encapsulated_proto))
-			return true;
+		protocol = veh->h_vlan_encapsulated_proto;
+		features &= dev->vlan_features;
 	}
 
-	return false;
+	return can_checksum_protocol(features, protocol);
 }
 
 /**
@@ -1763,6 +1796,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
 	__be16 type = skb->protocol;
 	int err;
 
+	if (type == htons(ETH_P_8021Q)) {
+		struct vlan_ethhdr *veh;
+
+		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
+			return ERR_PTR(-EINVAL);
+
+		veh = (struct vlan_ethhdr *)skb->data;
+		type = veh->h_vlan_encapsulated_proto;
+	}
+
 	skb_reset_mac_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
 	__skb_pull(skb, skb->mac_len);
@@ -1907,14 +1950,14 @@ static int dev_gso_segment(struct sk_buff *skb)
 
 /*
  * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested, since
- * drivers need to call skb_tstamp_tx() to send the timestamp.
+ * We cannot orphan skb if tx timestamp is requested or the sk-reference
+ * is needed on driver level for other reasons, e.g. see net/can/raw.c
  */
 static inline void skb_orphan_try(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	if (sk && !skb_tx(skb)->flags) {
+	if (sk && !skb_shinfo(skb)->tx_flags) {
 		/* skb_tx_hash() wont be able to get sk.
 		 * We copy sk_hash into skb->rxhash
 		 */
@@ -1924,6 +1967,27 @@ static inline void skb_orphan_try(struct sk_buff *skb)
 	}
 }
 
+/*
+ * Returns true if either:
+ *	1. skb has frag_list and the device doesn't support FRAGLIST, or
+ *	2. skb is fragmented and the device does not support SG, or if
+ *	   at least one of fragments is in highmem and device does not
+ *	   support DMA from it.
+ */
+static inline int skb_needs_linearize(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	int features = dev->features;
+
+	if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb))
+		features &= dev->vlan_features;
+
+	return skb_is_nonlinear(skb) &&
+	       ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
+		(skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) ||
+					      illegal_highdma(dev, skb))));
+}
+
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			struct netdev_queue *txq)
 {
@@ -1943,14 +2007,40 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 
 		skb_orphan_try(skb);
 
+		if (vlan_tx_tag_present(skb) &&
+		    !(dev->features & NETIF_F_HW_VLAN_TX)) {
+			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+			if (unlikely(!skb))
+				goto out;
+
+			skb->vlan_tci = 0;
+		}
+
 		if (netif_needs_gso(dev, skb)) {
 			if (unlikely(dev_gso_segment(skb)))
 				goto out_kfree_skb;
 			if (skb->next)
 				goto gso;
+		} else {
+			if (skb_needs_linearize(skb, dev) &&
+			    __skb_linearize(skb))
+				goto out_kfree_skb;
+
+			/* If packet is not checksummed and device does not
+			 * support checksumming for this protocol, complete
+			 * checksumming here.
+			 */
+			if (skb->ip_summed == CHECKSUM_PARTIAL) {
+				skb_set_transport_header(skb, skb->csum_start -
+					      skb_headroom(skb));
+				if (!dev_can_checksum(dev, skb) &&
+				     skb_checksum_help(skb))
+					goto out_kfree_skb;
+			}
 		}
 
 		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
@@ -1971,6 +2061,7 @@ gso:
 			skb_dst_drop(nskb);
 
 		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
 				goto out_kfree_gso_skb;
@@ -1988,6 +2079,7 @@ out_kfree_gso_skb:
 		skb->destructor = DEV_GSO_CB(skb)->destructor;
 out_kfree_skb:
 	kfree_skb(skb);
+out:
 	return rc;
 }
 
@@ -2031,16 +2123,16 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
 	int queue_index;
-	struct sock *sk = skb->sk;
+	const struct net_device_ops *ops = dev->netdev_ops;
 
-	queue_index = sk_tx_queue_get(sk);
-	if (queue_index < 0) {
-		const struct net_device_ops *ops = dev->netdev_ops;
+	if (ops->ndo_select_queue) {
+		queue_index = ops->ndo_select_queue(dev, skb);
+		queue_index = dev_cap_txqueue(dev, queue_index);
+	} else {
+		struct sock *sk = skb->sk;
+		queue_index = sk_tx_queue_get(sk);
+		if (queue_index < 0) {
 
-		if (ops->ndo_select_queue) {
-			queue_index = ops->ndo_select_queue(dev, skb);
-			queue_index = dev_cap_txqueue(dev, queue_index);
-		} else {
 			queue_index = 0;
 			if (dev->real_num_tx_queues > 1)
 				queue_index = skb_tx_hash(dev, skb);
@@ -2063,14 +2155,24 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct netdev_queue *txq)
 {
 	spinlock_t *root_lock = qdisc_lock(q);
+	bool contended = qdisc_is_running(q);
 	int rc;
 
+	/*
+	 * Heuristic to force contended enqueues to serialize on a
+	 * separate lock before trying to get qdisc main lock.
+	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
+	 * and dequeue packets faster.
+	 */
+	if (unlikely(contended))
+		spin_lock(&q->busylock);
+
 	spin_lock(root_lock);
 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
 		kfree_skb(skb);
 		rc = NET_XMIT_DROP;
 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
-		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
+		   qdisc_run_begin(q)) {
 		/*
 		 * This is a work-conserving queue; there are no old skbs
 		 * waiting to be sent out; and the qdisc is not running -
@@ -2079,36 +2181,35 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
 		__qdisc_update_bstats(q, skb->len);
-		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
+		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
 			__qdisc_run(q);
-		else
-			clear_bit(__QDISC_STATE_RUNNING, &q->state);
+		} else
+			qdisc_run_end(q);
 
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		skb_dst_force(skb);
 		rc = qdisc_enqueue_root(skb, q);
-		qdisc_run(q);
+		if (qdisc_run_begin(q)) {
+			if (unlikely(contended)) {
+				spin_unlock(&q->busylock);
+				contended = false;
+			}
+			__qdisc_run(q);
+		}
 	}
 	spin_unlock(root_lock);
-
+	if (unlikely(contended))
+		spin_unlock(&q->busylock);
 	return rc;
 }
 
-/*
- * Returns true if either:
- *	1. skb has frag_list and the device doesn't support FRAGLIST, or
- *	2. skb is fragmented and the device does not support SG, or if
- *	   at least one of fragments is in highmem and device does not
- *	   support DMA from it.
- */
-static inline int skb_needs_linearize(struct sk_buff *skb,
-				      struct net_device *dev)
-{
-	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
-	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
-					      illegal_highdma(dev, skb)));
-}
+static DEFINE_PER_CPU(int, xmit_recursion);
+#define RECURSION_LIMIT 10
 
 /**
  *	dev_queue_xmit - transmit a buffer
@@ -2142,25 +2243,6 @@ int dev_queue_xmit(struct sk_buff *skb)
 	struct Qdisc *q;
 	int rc = -ENOMEM;
 
-	/* GSO will handle the following emulations directly. */
-	if (netif_needs_gso(dev, skb))
-		goto gso;
-
-	/* Convert a paged skb to linear, if required */
-	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
-		goto out_kfree_skb;
-
-	/* If packet is not checksummed and device does not support
-	 * checksumming for this protocol, complete checksumming here.
-	 */
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		skb_set_transport_header(skb, skb->csum_start -
-					      skb_headroom(skb));
-		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
-			goto out_kfree_skb;
-	}
-
-gso:
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
 	 */
@@ -2172,6 +2254,7 @@ gso:
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
 #endif
+	trace_net_dev_queue(skb);
 	if (q->enqueue) {
 		rc = __dev_xmit_skb(skb, q, dev, txq);
 		goto out;
@@ -2194,10 +2277,15 @@ gso:
 
 		if (txq->xmit_lock_owner != cpu) {
 
+			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
+				goto recursion_alert;
+
 			HARD_TX_LOCK(dev, txq, cpu);
 
 			if (!netif_tx_queue_stopped(txq)) {
+				__this_cpu_inc(xmit_recursion);
 				rc = dev_hard_start_xmit(skb, dev, txq);
+				__this_cpu_dec(xmit_recursion);
 				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
 					goto out;
@@ -2209,7 +2297,9 @@ gso:
 				       "queue packet!\n", dev->name);
 		} else {
 			/* Recursion is detected! It is possible,
-			 * unfortunately */
+			 * unfortunately
+			 */
+recursion_alert:
 			if (net_ratelimit())
 				printk(KERN_CRIT "Dead loop on virtual device "
 				       "%s, fix it urgently!\n", dev->name);
@@ -2219,7 +2309,6 @@ gso:
 	rc = -ENETDOWN;
 	rcu_read_unlock_bh();
 
-out_kfree_skb:
 	kfree_skb(skb);
 	return rc;
 out:
@@ -2246,69 +2335,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
 /*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Returns a non-zero hash number on success
+ * and 0 on failure.
  */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-		       struct rps_dev_flow **rflowp)
+__u32 __skb_get_rxhash(struct sk_buff *skb)
 {
+	int nhoff, hash = 0, poff;
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
-	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
-	int cpu = -1;
 	u8 ip_proto;
-	u16 tcpu;
 	u32 addr1, addr2, ihl;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;
 
-	if (skb_rx_queue_recorded(skb)) {
-		u16 index = skb_get_rx_queue(skb);
-		if (unlikely(index >= dev->num_rx_queues)) {
-			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-				"on queue %u, but number of RX queues is %u\n",
-				dev->name, index, dev->num_rx_queues);
-			goto done;
-		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
-
-	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
-		goto done;
-
-	if (skb->rxhash)
-		goto got_hash; /* Skip hash computation on packet header */
+	nhoff = skb_network_offset(skb);
 
 	switch (skb->protocol) {
 	case __constant_htons(ETH_P_IP):
-		if (!pskb_may_pull(skb, sizeof(*ip)))
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
 
-		ip = (struct iphdr *) skb->data;
-		ip_proto = ip->protocol;
+		ip = (struct iphdr *) (skb->data + nhoff);
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		else
+			ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
 		ihl = ip->ihl;
 		break;
 	case __constant_htons(ETH_P_IPV6):
-		if (!pskb_may_pull(skb, sizeof(*ip6)))
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;
 
-		ip6 = (struct ipv6hdr *) skb->data;
+		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2317,33 +2381,81 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	default:
 		goto done;
 	}
-	switch (ip_proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ESP:
-	case IPPROTO_AH:
-	case IPPROTO_SCTP:
-	case IPPROTO_UDPLITE:
-		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
-			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+
+	ports.v32 = 0;
+	poff = proto_ports_offset(ip_proto);
+	if (poff >= 0) {
+		nhoff += ihl * 4 + poff;
+		if (pskb_may_pull(skb, nhoff + 4)) {
+			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
-			break;
 		}
-	default:
-		ports.v32 = 0;
-		break;
 	}
 
 	/* get a consistent hash (same value on both flow directions) */
 	if (addr2 < addr1)
 		swap(addr1, addr2);
-	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
-	if (!skb->rxhash)
-		skb->rxhash = 1;
 
-got_hash:
+	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+	if (!hash)
+		hash = 1;
+
+done:
+	return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
+	int cpu = -1;
+	u16 tcpu;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->real_num_rx_queues)) {
+			WARN_ONCE(dev->real_num_rx_queues > 1,
+				  "%s received packet on queue %u, but number "
+				  "of RX queues is %u\n",
+				  dev->name, index, dev->real_num_rx_queues);
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	map = rcu_dereference(rxqueue->rps_map);
+	if (map) {
+		if (map->len == 1) {
+			tcpu = map->cpus[0];
+			if (cpu_online(tcpu))
+				cpu = tcpu;
+			goto done;
+		}
+	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+		goto done;
+	}
+
+	skb_reset_network_header(skb);
+	if (!skb_get_rxhash(skb))
+		goto done;
+
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
@@ -2383,7 +2495,6 @@ got_hash:
 		}
 	}
 
-	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
@@ -2469,6 +2580,7 @@ enqueue:
 
 	local_irq_restore(flags);
 
+	atomic_long_inc(&skb->dev->rx_dropped);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -2499,11 +2611,13 @@ int netif_rx(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	trace_netif_rx(skb);
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
 		int cpu;
 
+		preempt_disable();
 		rcu_read_lock();
 
 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -2513,6 +2627,7 @@ int netif_rx(struct sk_buff *skb)
 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
 		rcu_read_unlock();
+		preempt_enable();
 	}
 #else
 	{
@@ -2556,6 +2671,7 @@ static void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(atomic_read(&skb->users));
+			trace_kfree_skb(skb, net_tx_action);
 			__kfree_skb(skb);
 		}
 	}
@@ -2604,70 +2720,14 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
-#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
-
-#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
+#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
+    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
 /* This hook is defined here for ATM LANE */
 int (*br_fdb_test_addr_hook)(struct net_device *dev,
 			     unsigned char *addr) __read_mostly;
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-/*
- * If bridge module is loaded call bridging hook.
- *  returns NULL if packet was consumed.
- */
-struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
-					struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(br_handle_frame_hook);
-
-static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
-					    struct packet_type **pt_prev, int *ret,
-					    struct net_device *orig_dev)
-{
-	struct net_bridge_port *port;
-
-	if (skb->pkt_type == PACKET_LOOPBACK ||
-	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
-		return skb;
-
-	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
-		*pt_prev = NULL;
-	}
-
-	return br_handle_frame_hook(port, skb);
-}
-#else
-#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
-#endif
-
-#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
-struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
-					     struct sk_buff *skb) __read_mostly;
-EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
-
-static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
-					     struct packet_type **pt_prev,
-					     int *ret,
-					     struct net_device *orig_dev)
-{
-	struct macvlan_port *port;
-
-	port = rcu_dereference(skb->dev->macvlan_port);
-	if (!port)
-		return skb;
-
-	if (*pt_prev) {
-		*ret = deliver_skb(skb, *pt_prev, orig_dev);
-		*pt_prev = NULL;
-	}
-	return macvlan_handle_frame_hook(port, skb);
-}
-#else
-#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
-#endif
-
 #ifdef CONFIG_NET_CLS_ACT
 /* TODO: Maybe we should just force sch_ingress to be compiled in
  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
@@ -2677,26 +2737,23 @@ static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
  * the ingress scheduler, you just cant add policies on ingress.
  *
  */
-static int ing_filter(struct sk_buff *skb)
+static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
 {
 	struct net_device *dev = skb->dev;
 	u32 ttl = G_TC_RTTL(skb->tc_verd);
-	struct netdev_queue *rxq;
 	int result = TC_ACT_OK;
 	struct Qdisc *q;
 
-	if (MAX_RED_LOOP < ttl++) {
-		printk(KERN_WARNING
-		       "Redir loop detected Dropping packet (%d->%d)\n",
-		       skb->skb_iif, dev->ifindex);
+	if (unlikely(MAX_RED_LOOP < ttl++)) {
+		if (net_ratelimit())
+			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
+			       skb->skb_iif, dev->ifindex);
 		return TC_ACT_SHOT;
 	}
 
 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	rxq = &dev->rx_queue;
-
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
@@ -2712,18 +2769,17 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
+	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
+
+	if (!rxq || rxq->qdisc == &noop_qdisc)
 		goto out;
 
 	if (*pt_prev) {
 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 		*pt_prev = NULL;
-	} else {
-		/* Huh? Why does turning on AF_PACKET affect this? */
-		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
 	}
 
-	switch (ing_filter(skb)) {
+	switch (ing_filter(skb, rxq)) {
 	case TC_ACT_SHOT:
 	case TC_ACT_STOLEN:
 		kfree_skb(skb);
@@ -2736,32 +2792,50 @@ out:
 }
 #endif
 
-/*
- * 	netif_nit_deliver - deliver received packets to network taps
- * 	@skb: buffer
+/**
+ *	netdev_rx_handler_register - register receive handler
+ *	@dev: device to register a handler for
+ *	@rx_handler: receive handler to register
+ *	@rx_handler_data: data pointer that is used by rx handler
+ *
+ *	Register a receive hander for a device. This handler will then be
+ *	called from __netif_receive_skb. A negative errno code is returned
+ *	on a failure.
  *
- * 	This function is used to deliver incoming packets to network
- * 	taps. It should be used when the normal netif_receive_skb path
- * 	is bypassed, for example because of VLAN acceleration.
+ *	The caller must hold the rtnl_mutex.
  */
-void netif_nit_deliver(struct sk_buff *skb)
+int netdev_rx_handler_register(struct net_device *dev,
+			       rx_handler_func_t *rx_handler,
+			       void *rx_handler_data)
 {
-	struct packet_type *ptype;
+	ASSERT_RTNL();
 
-	if (list_empty(&ptype_all))
-		return;
+	if (dev->rx_handler)
+		return -EBUSY;
 
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-	skb->mac_len = skb->network_header - skb->mac_header;
+	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
+	rcu_assign_pointer(dev->rx_handler, rx_handler);
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-		if (!ptype->dev || ptype->dev == skb->dev)
-			deliver_skb(skb, ptype, skb->dev);
-	}
-	rcu_read_unlock();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
+
+/**
+ *	netdev_rx_handler_unregister - unregister receive handler
+ *	@dev: device to unregister a handler from
+ *
+ *	Unregister a receive hander from a device.
+ *
+ *	The caller must hold the rtnl_mutex.
+ */
+void netdev_rx_handler_unregister(struct net_device *dev)
+{
+
+	ASSERT_RTNL();
+	rcu_assign_pointer(dev->rx_handler, NULL);
+	rcu_assign_pointer(dev->rx_handler_data, NULL);
 }
+EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
 					      struct net_device *master)
@@ -2784,7 +2858,8 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
 	if (master->priv_flags & IFF_MASTER_ARPMON)
 		dev->last_rx = jiffies;
 
-	if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
+	if ((master->priv_flags & IFF_MASTER_ALB) &&
+	    (master->priv_flags & IFF_BRIDGE_PORT)) {
 		/* Do address unmangle. The local destination address
 		 * will be always the one master has. Provides the right
 		 * functionality in a bridge.
@@ -2815,6 +2890,7 @@ EXPORT_SYMBOL(__skb_bond_should_drop);
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
+	rx_handler_func_t *rx_handler;
 	struct net_device *orig_dev;
 	struct net_device *master;
 	struct net_device *null_or_orig;
@@ -2825,8 +2901,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
-	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
-		return NET_RX_SUCCESS;
+	trace_netif_receive_skb(skb);
 
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
@@ -2840,8 +2915,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	 * be delivered to pkt handlers that are exact matches.  Also
 	 * the deliver_no_wcard flag will be set.  If packet handlers
 	 * are sensitive to duplicate packets these skbs will need to
-	 * be dropped at the handler.  The vlan accel path may have
-	 * already set the deliver_no_wcard flag.
+	 * be dropped at the handler.
 	 */
 	null_or_orig = NULL;
 	orig_dev = skb->dev;
@@ -2856,8 +2930,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 			skb->dev = master;
 	}
 
-	__get_cpu_var(softnet_data).processed++;
-
+	__this_cpu_inc(softnet_data.processed);
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
@@ -2889,12 +2962,29 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
-	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
-	if (!skb)
-		goto out;
-	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
-	if (!skb)
-		goto out;
+	/* Handle special case of bridge or macvlan */
+	rx_handler = rcu_dereference(skb->dev->rx_handler);
+	if (rx_handler) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		skb = rx_handler(skb);
+		if (!skb)
+			goto out;
+	}
+
+	if (vlan_tx_tag_present(skb)) {
+		if (pt_prev) {
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+			pt_prev = NULL;
+		}
+		if (vlan_hwaccel_do_receive(&skb)) {
+			ret = __netif_receive_skb(skb);
+			goto out;
+		} else if (unlikely(!skb))
+			goto out;
+	}
 
 	/*
 	 * Make sure frames received on VLAN interfaces stacked on
@@ -2923,6 +3013,7 @@ ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -2955,6 +3046,9 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	if (skb_defer_rx_timestamp(skb))
+		return NET_RX_SUCCESS;
+
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -3040,7 +3134,7 @@ out:
 	return netif_receive_skb(skb);
 }
 
-static void napi_gro_flush(struct napi_struct *napi)
+inline void napi_gro_flush(struct napi_struct *napi)
 {
 	struct sk_buff *skb, *next;
 
@@ -3053,6 +3147,7 @@ static void napi_gro_flush(struct napi_struct *napi)
 	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
+EXPORT_SYMBOL(napi_gro_flush);
 
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
@@ -3064,10 +3159,10 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	int mac_len;
 	enum gro_result ret;
 
-	if (!(skb->dev->features & NETIF_F_GRO))
+	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
 		goto normal;
 
-	if (skb_is_gso(skb) || skb_has_frags(skb))
+	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
 	rcu_read_lock();
@@ -3133,7 +3228,7 @@ pull:
 			put_page(skb_shinfo(skb)->frags[0].page);
 			memmove(skb_shinfo(skb)->frags,
 				skb_shinfo(skb)->frags + 1,
-				--skb_shinfo(skb)->nr_frags);
+				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
 		}
 	}
 
@@ -3146,19 +3241,19 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static gro_result_t
+static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
-	if (netpoll_rx_on(skb))
-		return GRO_NORMAL;
-
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow =
-			(p->dev == skb->dev) &&
-			!compare_ether_header(skb_mac_header(p),
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= p->vlan_tci ^ skb->vlan_tci;
+		diffs |= compare_ether_header(skb_mac_header(p),
 					      skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow = !diffs;
 		NAPI_GRO_CB(p)->flush = 0;
 	}
 
@@ -3211,14 +3306,14 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
-void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 {
 	__skb_pull(skb, skb_headlen(skb));
 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
+	skb->vlan_tci = 0;
 
 	napi->skb = skb;
 }
-EXPORT_SYMBOL(napi_reuse_skb);
 
 struct sk_buff *napi_get_frags(struct napi_struct *napi)
 {
@@ -3719,10 +3814,11 @@ void dev_seq_stop(struct seq_file *seq, void *v)
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
-	const struct net_device_stats *stats = dev_get_stats(dev);
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 
-	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
-		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
+	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
+		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
 		   dev->name, stats->rx_bytes, stats->rx_packets,
 		   stats->rx_errors,
 		   stats->rx_dropped + stats->rx_missed_errors,
@@ -4837,7 +4933,7 @@ static void rollback_registered_many(struct list_head *head)
 	dev = list_first_entry(head, struct net_device, unreg_list);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
 
-	synchronize_net();
+	rcu_barrier();
 
 	list_for_each_entry(dev, head, unreg_list)
 		dev_put(dev);
@@ -4851,21 +4947,6 @@ static void rollback_registered(struct net_device *dev)
 	rollback_registered_many(&single);
 }
 
-static void __netdev_init_queue_locks_one(struct net_device *dev,
-					  struct netdev_queue *dev_queue,
-					  void *_unused)
-{
-	spin_lock_init(&dev_queue->_xmit_lock);
-	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
-	dev_queue->xmit_lock_owner = -1;
-}
-
-static void netdev_init_queue_locks(struct net_device *dev)
-{
-	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
-	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
-}
-
 unsigned long netdev_fix_features(unsigned long features, const char *name)
 {
 	/* Fix illegal SG+CSUM combinations. */
@@ -4933,6 +5014,66 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
 }
 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
 
+static int netif_alloc_rx_queues(struct net_device *dev)
+{
+#ifdef CONFIG_RPS
+	unsigned int i, count = dev->num_rx_queues;
+	struct netdev_rx_queue *rx;
+
+	BUG_ON(count < 1);
+
+	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+	if (!rx) {
+		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
+		return -ENOMEM;
+	}
+	dev->_rx = rx;
+
+	/*
+	 * Set a pointer to first element in the array which holds the
+	 * reference count.
+	 */
+	for (i = 0; i < count; i++)
+		rx[i].first = rx;
+#endif
+	return 0;
+}
+
+static int netif_alloc_netdev_queues(struct net_device *dev)
+{
+	unsigned int count = dev->num_tx_queues;
+	struct netdev_queue *tx;
+
+	BUG_ON(count < 1);
+
+	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
+	if (!tx) {
+		pr_err("netdev: Unable to allocate %u tx queues.\n",
+		       count);
+		return -ENOMEM;
+	}
+	dev->_tx = tx;
+	return 0;
+}
+
+static void netdev_init_one_queue(struct net_device *dev,
+				  struct netdev_queue *queue,
+				  void *_unused)
+{
+	queue->dev = dev;
+
+	/* Initialize queue lock */
+	spin_lock_init(&queue->_xmit_lock);
+	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
+	queue->xmit_lock_owner = -1;
+}
+
+static void netdev_init_queues(struct net_device *dev)
+{
+	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+	spin_lock_init(&dev->tx_global_lock);
+}
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
@@ -4966,28 +5107,19 @@ int register_netdevice(struct net_device *dev)
 
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
-	netdev_init_queue_locks(dev);
 
 	dev->iflink = -1;
 
-#ifdef CONFIG_RPS
-	if (!dev->num_rx_queues) {
-		/*
-		 * Allocate a single RX queue if driver never called
-		 * alloc_netdev_mq
-		 */
+	ret = netif_alloc_rx_queues(dev);
+	if (ret)
+		goto out;
 
-		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
-		if (!dev->_rx) {
-			ret = -ENOMEM;
-			goto out;
-		}
+	ret = netif_alloc_netdev_queues(dev);
+	if (ret)
+		goto out;
+
+	netdev_init_queues(dev);
 
-		dev->_rx->first = dev->_rx;
-		atomic_set(&dev->_rx->count, 1);
-		dev->num_rx_queues = 1;
-	}
-#endif
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
@@ -5027,6 +5159,12 @@ int register_netdevice(struct net_device *dev)
 	if (dev->features & NETIF_F_SG)
 		dev->features |= NETIF_F_GSO;
 
+	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
+	 * vlan_dev_init() will do the dev->features check, so these features
+	 * are enabled only if supported by underlying device.
+	 */
+	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
+
 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
 	ret = notifier_to_errno(ret);
 	if (ret)
@@ -5097,9 +5235,6 @@ int init_dummy_netdev(struct net_device *dev)
 	 */
 	dev->reg_state = NETREG_DUMMY;
 
-	/* initialize the ref count */
-	atomic_set(&dev->refcnt, 1);
-
 	/* NAPI wants this */
 	INIT_LIST_HEAD(&dev->napi_list);
 
@@ -5107,6 +5242,11 @@ int init_dummy_netdev(struct net_device *dev)
 	set_bit(__LINK_STATE_PRESENT, &dev->state);
 	set_bit(__LINK_STATE_START, &dev->state);
 
+	/* Note : We dont allocate pcpu_refcnt for dummy devices,
+	 * because users of this 'device' dont need to change
+	 * its refcount.
+	 */
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(init_dummy_netdev);
@@ -5148,6 +5288,16 @@ out:
 }
 EXPORT_SYMBOL(register_netdev);
 
+int netdev_refcnt_read(const struct net_device *dev)
+{
+	int i, refcnt = 0;
+
+	for_each_possible_cpu(i)
+		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
+	return refcnt;
+}
+EXPORT_SYMBOL(netdev_refcnt_read);
+
 /*
  * netdev_wait_allrefs - wait until all references are gone.
  *
@@ -5162,11 +5312,14 @@ EXPORT_SYMBOL(register_netdev);
 static void netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
+	int refcnt;
 
 	linkwatch_forget_dev(dev);
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	refcnt = netdev_refcnt_read(dev);
+
+	while (refcnt != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_lock();
 
@@ -5193,11 +5346,13 @@ static void netdev_wait_allrefs(struct net_device *dev)
 
 		msleep(250);
 
+		refcnt = netdev_refcnt_read(dev);
+
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, refcnt);
 			warning_time = jiffies;
 		}
 	}
@@ -5255,9 +5410,9 @@ void netdev_run_todo(void)
 		netdev_wait_allrefs(dev);
 
 		/* paranoia */
-		BUG_ON(atomic_read(&dev->refcnt));
-		WARN_ON(dev->ip_ptr);
-		WARN_ON(dev->ip6_ptr);
+		BUG_ON(netdev_refcnt_read(dev));
+		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
+		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
 
 		if (dev->destructor)
@@ -5271,20 +5426,22 @@ void netdev_run_todo(void)
 /**
  *	dev_txq_stats_fold - fold tx_queues stats
  *	@dev: device to get statistics from
- *	@stats: struct net_device_stats to hold results
+ *	@stats: struct rtnl_link_stats64 to hold results
  */
 void dev_txq_stats_fold(const struct net_device *dev,
-			struct net_device_stats *stats)
+			struct rtnl_link_stats64 *stats)
 {
-	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+	u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
 	unsigned int i;
 	struct netdev_queue *txq;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		txq = netdev_get_tx_queue(dev, i);
+		spin_lock_bh(&txq->_xmit_lock);
 		tx_bytes   += txq->tx_bytes;
 		tx_packets += txq->tx_packets;
 		tx_dropped += txq->tx_dropped;
+		spin_unlock_bh(&txq->_xmit_lock);
 	}
 	if (tx_bytes || tx_packets || tx_dropped) {
 		stats->tx_bytes   = tx_bytes;
@@ -5294,38 +5451,72 @@ void dev_txq_stats_fold(const struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_txq_stats_fold);
 
+/* Convert net_device_stats to rtnl_link_stats64.  They have the same
+ * fields in the same order, with only the type differing.
+ */
+static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+				    const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+        BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+        memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+	size_t i, n = sizeof(*stats64) / sizeof(u64);
+	const unsigned long *src = (const unsigned long *)netdev_stats;
+	u64 *dst = (u64 *)stats64;
+
+	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+		     sizeof(*stats64) / sizeof(u64));
+	for (i = 0; i < n; i++)
+		dst[i] = src[i];
+#endif
+}
+
 /**
  *	dev_get_stats	- get network device statistics
  *	@dev: device to get statistics from
+ *	@storage: place to store stats
  *
- *	Get network statistics from device. The device driver may provide
- *	its own method by setting dev->netdev_ops->get_stats; otherwise
- *	the internal statistics structure is used.
+ *	Get network statistics from device. Return @storage.
+ *	The device driver may provide its own method by setting
+ *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
+ *	otherwise the internal statistics structure is used.
  */
-const struct net_device_stats *dev_get_stats(struct net_device *dev)
+struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
+					struct rtnl_link_stats64 *storage)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
-	if (ops->ndo_get_stats)
-		return ops->ndo_get_stats(dev);
-
-	dev_txq_stats_fold(dev, &dev->stats);
-	return &dev->stats;
+	if (ops->ndo_get_stats64) {
+		memset(storage, 0, sizeof(*storage));
+		ops->ndo_get_stats64(dev, storage);
+	} else if (ops->ndo_get_stats) {
+		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+	} else {
+		netdev_stats_to_stats64(storage, &dev->stats);
+		dev_txq_stats_fold(dev, storage);
+	}
+	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
+	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
 
-static void netdev_init_one_queue(struct net_device *dev,
-				  struct netdev_queue *queue,
-				  void *_unused)
+struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
 {
-	queue->dev = dev;
-}
+	struct netdev_queue *queue = dev_ingress_queue(dev);
 
-static void netdev_init_queues(struct net_device *dev)
-{
-	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
-	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
-	spin_lock_init(&dev->tx_global_lock);
+#ifdef CONFIG_NET_CLS_ACT
+	if (queue)
+		return queue;
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return NULL;
+	netdev_init_one_queue(dev, queue, NULL);
+	queue->qdisc = &noop_qdisc;
+	queue->qdisc_sleeping = &noop_qdisc;
+	rcu_assign_pointer(dev->ingress_queue, queue);
+#endif
+	return queue;
 }
 
 /**
@@ -5342,17 +5533,18 @@ static void netdev_init_queues(struct net_device *dev)
 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		void (*setup)(struct net_device *), unsigned int queue_count)
 {
-	struct netdev_queue *tx;
 	struct net_device *dev;
 	size_t alloc_size;
 	struct net_device *p;
-#ifdef CONFIG_RPS
-	struct netdev_rx_queue *rx;
-	int i;
-#endif
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
 
+	if (queue_count < 1) {
+		pr_err("alloc_netdev: Unable to allocate device "
+		       "with zero queues.\n");
+		return NULL;
+	}
+
 	alloc_size = sizeof(struct net_device);
 	if (sizeof_priv) {
 		/* ensure 32-byte alignment of private area */
@@ -5368,55 +5560,31 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		return NULL;
 	}
 
-	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
-	if (!tx) {
-		printk(KERN_ERR "alloc_netdev: Unable to allocate "
-		       "tx qdiscs.\n");
-		goto free_p;
-	}
-
-#ifdef CONFIG_RPS
-	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
-	if (!rx) {
-		printk(KERN_ERR "alloc_netdev: Unable to allocate "
-		       "rx queues.\n");
-		goto free_tx;
-	}
-
-	atomic_set(&rx->count, queue_count);
-
-	/*
-	 * Set a pointer to first element in the array which holds the
-	 * reference count.
-	 */
-	for (i = 0; i < queue_count; i++)
-		rx[i].first = rx;
-#endif
-
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
+	dev->pcpu_refcnt = alloc_percpu(int);
+	if (!dev->pcpu_refcnt)
+		goto free_p;
+
 	if (dev_addr_init(dev))
-		goto free_rx;
+		goto free_pcpu;
 
 	dev_mc_init(dev);
 	dev_uc_init(dev);
 
 	dev_net_set(dev, &init_net);
 
-	dev->_tx = tx;
 	dev->num_tx_queues = queue_count;
 	dev->real_num_tx_queues = queue_count;
 
 #ifdef CONFIG_RPS
-	dev->_rx = rx;
 	dev->num_rx_queues = queue_count;
+	dev->real_num_rx_queues = queue_count;
 #endif
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
-	netdev_init_queues(dev);
-
 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
 	dev->ethtool_ntuple_list.count = 0;
 	INIT_LIST_HEAD(&dev->napi_list);
@@ -5427,12 +5595,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	strcpy(dev->name, name);
 	return dev;
 
-free_rx:
-#ifdef CONFIG_RPS
-	kfree(rx);
-free_tx:
-#endif
-	kfree(tx);
+free_pcpu:
+	free_percpu(dev->pcpu_refcnt);
 free_p:
 	kfree(p);
 	return NULL;
@@ -5455,6 +5619,8 @@ void free_netdev(struct net_device *dev)
 
 	kfree(dev->_tx);
 
+	kfree(rcu_dereference_raw(dev->ingress_queue));
+
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
@@ -5464,6 +5630,9 @@ void free_netdev(struct net_device *dev)
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
+	free_percpu(dev->pcpu_refcnt);
+	dev->pcpu_refcnt = NULL;
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);
@@ -5618,6 +5787,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Notify protocols, that we are about to destroy
 	   this device. They should clean all the things.
+
+	   Note that dev->reg_state stays at NETREG_REGISTERED.
+	   This is wanted because this way 8021q and macvlan know
+	   the device is just moving and can keep their slaves up.
 	*/
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
@@ -5815,6 +5988,68 @@ char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
 	return buffer;
 }
 
+static int __netdev_printk(const char *level, const struct net_device *dev,
+			   struct va_format *vaf)
+{
+	int r;
+
+	if (dev && dev->dev.parent)
+		r = dev_printk(level, dev->dev.parent, "%s: %pV",
+			       netdev_name(dev), vaf);
+	else if (dev)
+		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
+	else
+		r = printk("%s(NULL net_device): %pV", level, vaf);
+
+	return r;
+}
+
+int netdev_printk(const char *level, const struct net_device *dev,
+		  const char *format, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int r;
+
+	va_start(args, format);
+
+	vaf.fmt = format;
+	vaf.va = &args;
+
+	r = __netdev_printk(level, dev, &vaf);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(netdev_printk);
+
+#define define_netdev_printk_level(func, level)			\
+int func(const struct net_device *dev, const char *fmt, ...)	\
+{								\
+	int r;							\
+	struct va_format vaf;					\
+	va_list args;						\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	r = __netdev_printk(level, dev, &vaf);			\
+	va_end(args);						\
+								\
+	return r;						\
+}								\
+EXPORT_SYMBOL(func);
+
+define_netdev_printk_level(netdev_emerg, KERN_EMERG);
+define_netdev_printk_level(netdev_alert, KERN_ALERT);
+define_netdev_printk_level(netdev_crit, KERN_CRIT);
+define_netdev_printk_level(netdev_err, KERN_ERR);
+define_netdev_printk_level(netdev_warn, KERN_WARNING);
+define_netdev_printk_level(netdev_notice, KERN_NOTICE);
+define_netdev_printk_level(netdev_info, KERN_INFO);
+
 static void __net_exit netdev_exit(struct net *net)
 {
 	kfree(net->dev_name_head);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index ad41529fb60..36e603c78ce 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -223,6 +223,11 @@ static int set_all_monitor_traces(int state)
 
 	spin_lock(&trace_state_lock);
 
+	if (state == trace_state) {
+		rc = -EAGAIN;
+		goto out_unlock;
+	}
+
 	switch (state) {
 	case TRACE_ON:
 		rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
@@ -251,11 +256,12 @@ static int set_all_monitor_traces(int state)
 
 	if (!rc)
 		trace_state = state;
+	else
+		rc = -EINPROGRESS;
 
+out_unlock:
 	spin_unlock(&trace_state_lock);
 
-	if (rc)
-		return -EINPROGRESS;
 	return rc;
 }
 
@@ -341,9 +347,9 @@ static struct notifier_block dropmon_net_notifier = {
 
 static int __init init_net_drop_monitor(void)
 {
-	int cpu;
-	int rc, i, ret;
 	struct per_cpu_dm_data *data;
+	int cpu, rc;
+
 	printk(KERN_INFO "Initalizing network drop monitor service\n");
 
 	if (sizeof(void *) > 8) {
@@ -351,21 +357,12 @@ static int __init init_net_drop_monitor(void)
 		return -ENOSPC;
 	}
 
-	if (genl_register_family(&net_drop_monitor_family) < 0) {
+	rc = genl_register_family_with_ops(&net_drop_monitor_family,
+					   dropmon_ops,
+					   ARRAY_SIZE(dropmon_ops));
+	if (rc) {
 		printk(KERN_ERR "Could not create drop monitor netlink family\n");
-		return -EFAULT;
-	}
-
-	rc = -EFAULT;
-
-	for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) {
-		ret = genl_register_ops(&net_drop_monitor_family,
-					&dropmon_ops[i]);
-		if (ret) {
-			printk(KERN_CRIT "Failed to register operation %d\n",
-				dropmon_ops[i].cmd);
-			goto out_unreg;
-		}
+		return rc;
 	}
 
 	rc = register_netdevice_notifier(&dropmon_net_notifier);
diff --git a/net/core/dst.c b/net/core/dst.c
index 9920722cc82..8abe628b79f 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops)
 {
 	struct dst_entry *dst;
 
-	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
+	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
 		if (ops->gc(ops))
 			return NULL;
 	}
@@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops)
 #if RT_CACHE_DEBUG >= 2
 	atomic_inc(&dst_total);
 #endif
-	atomic_inc(&ops->entries);
+	dst_entries_add(ops, 1);
 	return dst;
 }
 EXPORT_SYMBOL(dst_alloc);
@@ -197,7 +197,6 @@ static void ___dst_free(struct dst_entry *dst)
 		dst->input = dst->output = dst_discard;
 	dst->obsolete = 2;
 }
-EXPORT_SYMBOL(__dst_free);
 
 void __dst_free(struct dst_entry *dst)
 {
@@ -213,6 +212,7 @@ void __dst_free(struct dst_entry *dst)
 	}
 	spin_unlock_bh(&dst_garbage.lock);
 }
+EXPORT_SYMBOL(__dst_free);
 
 struct dst_entry *dst_destroy(struct dst_entry * dst)
 {
@@ -228,15 +228,15 @@ again:
 	child = dst->child;
 
 	dst->hh = NULL;
-	if (hh && atomic_dec_and_test(&hh->hh_refcnt))
-		kfree(hh);
+	if (hh)
+		hh_cache_put(hh);
 
 	if (neigh) {
 		dst->neighbour = NULL;
 		neigh_release(neigh);
 	}
 
-	atomic_dec(&dst->ops->entries);
+	dst_entries_add(dst->ops, -1);
 
 	if (dst->ops->destroy)
 		dst->ops->destroy(dst);
@@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst)
 	if (dst) {
 		int newrefcnt;
 
-		smp_mb__before_atomic_dec();
 		newrefcnt = atomic_dec_return(&dst->__refcnt);
 		WARN_ON(newrefcnt < 0);
+		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
+			dst = dst_destroy(dst);
+			if (dst)
+				__dst_free(dst);
+		}
 	}
 }
 EXPORT_SYMBOL(dst_release);
 
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	/* If dst not in cache, we must take a reference, because
+	 * dst_release() will destroy dst as soon as its refcount becomes zero
+	 */
+	if (unlikely(dst->flags & DST_NOCACHE)) {
+		dst_hold(dst);
+		skb_dst_set(skb, dst);
+	} else {
+		skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+	}
+}
+EXPORT_SYMBOL(skb_dst_set_noref);
+
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat
  * this mistake in 2.3, but we have no choice
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 75e4ffeb8cc..956a9f4971c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -19,6 +19,7 @@
 #include <linux/netdevice.h>
 #include <linux/bitops.h>
 #include <linux/uaccess.h>
+#include <linux/vmalloc.h>
 #include <linux/slab.h>
 
 /*
@@ -131,7 +132,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo);
  * NETIF_F_xxx values in include/linux/netdevice.h
  */
 static const u32 flags_dup_features =
-	(ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH);
+	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
+	 ETH_FLAG_RXHASH);
 
 u32 ethtool_op_get_flags(struct net_device *dev)
 {
@@ -144,31 +146,13 @@ u32 ethtool_op_get_flags(struct net_device *dev)
 }
 EXPORT_SYMBOL(ethtool_op_get_flags);
 
-int ethtool_op_set_flags(struct net_device *dev, u32 data)
+int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
 {
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-	unsigned long features = dev->features;
-
-	if (data & ETH_FLAG_LRO)
-		features |= NETIF_F_LRO;
-	else
-		features &= ~NETIF_F_LRO;
-
-	if (data & ETH_FLAG_NTUPLE) {
-		if (!ops->set_rx_ntuple)
-			return -EOPNOTSUPP;
-		features |= NETIF_F_NTUPLE;
-	} else {
-		/* safe to clear regardless */
-		features &= ~NETIF_F_NTUPLE;
-	}
-
-	if (data & ETH_FLAG_RXHASH)
-		features |= NETIF_F_RXHASH;
-	else
-		features &= ~NETIF_F_RXHASH;
+	if (data & ~supported)
+		return -EINVAL;
 
-	dev->features = features;
+	dev->features = ((dev->features & ~flags_dup_features) |
+			 (data & flags_dup_features));
 	return 0;
 }
 EXPORT_SYMBOL(ethtool_op_set_flags);
@@ -223,18 +207,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
 	struct ethtool_drvinfo info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 
-	if (!ops->get_drvinfo)
-		return -EOPNOTSUPP;
-
 	memset(&info, 0, sizeof(info));
 	info.cmd = ETHTOOL_GDRVINFO;
-	ops->get_drvinfo(dev, &info);
+	if (ops && ops->get_drvinfo) {
+		ops->get_drvinfo(dev, &info);
+	} else if (dev->dev.parent && dev->dev.parent->driver) {
+		strlcpy(info.bus_info, dev_name(dev->dev.parent),
+			sizeof(info.bus_info));
+		strlcpy(info.driver, dev->dev.parent->driver->name,
+			sizeof(info.driver));
+	} else {
+		return -EOPNOTSUPP;
+	}
 
 	/*
 	 * this method of obtaining string set info is deprecated;
 	 * Use ETHTOOL_GSSET_INFO instead.
 	 */
-	if (ops->get_sset_count) {
+	if (ops && ops->get_sset_count) {
 		int rc;
 
 		rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -247,9 +237,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
 		if (rc >= 0)
 			info.n_priv_flags = rc;
 	}
-	if (ops->get_regs_len)
+	if (ops && ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
-	if (ops->get_eeprom_len)
+	if (ops && ops->get_eeprom_len)
 		info.eedump_len = ops->get_eeprom_len(dev);
 
 	if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -366,7 +356,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
 	if (info.cmd == ETHTOOL_GRXCLSRLALL) {
 		if (info.rule_cnt > 0) {
 			if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
-				rule_buf = kmalloc(info.rule_cnt * sizeof(u32),
+				rule_buf = kzalloc(info.rule_cnt * sizeof(u32),
 						   GFP_USER);
 			if (!rule_buf)
 				return -ENOMEM;
@@ -395,6 +385,80 @@ err_out:
 	return ret;
 }
 
+static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
+						     void __user *useraddr)
+{
+	struct ethtool_rxfh_indir *indir;
+	u32 table_size;
+	size_t full_size;
+	int ret;
+
+	if (!dev->ethtool_ops->get_rxfh_indir)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&table_size,
+			   useraddr + offsetof(struct ethtool_rxfh_indir, size),
+			   sizeof(table_size)))
+		return -EFAULT;
+
+	if (table_size >
+	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
+		return -ENOMEM;
+	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
+	indir = kzalloc(full_size, GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	indir->cmd = ETHTOOL_GRXFHINDIR;
+	indir->size = table_size;
+	ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
+	if (ret)
+		goto out;
+
+	if (copy_to_user(useraddr, indir, full_size))
+		ret = -EFAULT;
+
+out:
+	kfree(indir);
+	return ret;
+}
+
+static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
+						     void __user *useraddr)
+{
+	struct ethtool_rxfh_indir *indir;
+	u32 table_size;
+	size_t full_size;
+	int ret;
+
+	if (!dev->ethtool_ops->set_rxfh_indir)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&table_size,
+			   useraddr + offsetof(struct ethtool_rxfh_indir, size),
+			   sizeof(table_size)))
+		return -EFAULT;
+
+	if (table_size >
+	    (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
+		return -ENOMEM;
+	full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
+	indir = kmalloc(full_size, GFP_USER);
+	if (!indir)
+		return -ENOMEM;
+
+	if (copy_from_user(indir, useraddr, full_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
+
+out:
+	kfree(indir);
+	return ret;
+}
+
 static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 			struct ethtool_rx_ntuple_flow_spec *spec,
 			struct ethtool_rx_ntuple_flow_spec_container *fsc)
@@ -423,6 +487,38 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
 	list->count++;
 }
 
+/*
+ * ethtool does not (or did not) set masks for flow parameters that are
+ * not specified, so if both value and mask are 0 then this must be
+ * treated as equivalent to a mask with all bits set.  Implement that
+ * here rather than in drivers.
+ */
+static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
+{
+	struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
+	struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
+
+	if (fs->flow_type != TCP_V4_FLOW &&
+	    fs->flow_type != UDP_V4_FLOW &&
+	    fs->flow_type != SCTP_V4_FLOW)
+		return;
+
+	if (!(entry->ip4src | mask->ip4src))
+		mask->ip4src = htonl(0xffffffff);
+	if (!(entry->ip4dst | mask->ip4dst))
+		mask->ip4dst = htonl(0xffffffff);
+	if (!(entry->psrc | mask->psrc))
+		mask->psrc = htons(0xffff);
+	if (!(entry->pdst | mask->pdst))
+		mask->pdst = htons(0xffff);
+	if (!(entry->tos | mask->tos))
+		mask->tos = 0xff;
+	if (!(fs->vlan_tag | fs->vlan_tag_mask))
+		fs->vlan_tag_mask = 0xffff;
+	if (!(fs->data | fs->data_mask))
+		fs->data_mask = 0xffffffffffffffffULL;
+}
+
 static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 						    void __user *useraddr)
 {
@@ -437,6 +533,8 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
+	rx_ntuple_fix_masks(&cmd.fs);
+
 	/*
 	 * Cache filter in dev struct for GET operation only if
 	 * the underlying driver doesn't have its own GET operation, and
@@ -482,7 +580,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
 
 	gstrings.len = ret;
 
-	data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+	data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
 	if (!data)
 		return -ENOMEM;
 
@@ -611,19 +709,19 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
 			break;
 		case IP_USER_FLOW:
 			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.raw_ip4_spec.ip4src);
+				fsc->fs.h_u.usr_ip4_spec.ip4src);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.raw_ip4_spec.ip4src);
+				fsc->fs.m_u.usr_ip4_spec.ip4src);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.raw_ip4_spec.ip4dst);
+				fsc->fs.h_u.usr_ip4_spec.ip4dst);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.raw_ip4_spec.ip4dst);
+				fsc->fs.m_u.usr_ip4_spec.ip4dst);
 			p += ETH_GSTRING_LEN;
 			num_strings++;
 			break;
@@ -719,7 +817,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	if (regs.len > reglen)
 		regs.len = reglen;
 
-	regbuf = kmalloc(reglen, GFP_USER);
+	regbuf = vmalloc(reglen);
 	if (!regbuf)
 		return -ENOMEM;
 
@@ -734,7 +832,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
 	ret = 0;
 
  out:
-	kfree(regbuf);
+	vfree(regbuf);
 	return ret;
 }
 
@@ -1119,8 +1217,11 @@ static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
 		return -EFAULT;
 
 	if (edata.data) {
-		if (!dev->ethtool_ops->get_rx_csum ||
-		    !dev->ethtool_ops->get_rx_csum(dev))
+		u32 rxcsum = dev->ethtool_ops->get_rx_csum ?
+				dev->ethtool_ops->get_rx_csum(dev) :
+				ethtool_op_get_rx_csum(dev);
+
+		if (!rxcsum)
 			return -EINVAL;
 		dev->features |= NETIF_F_GRO;
 	} else
@@ -1346,14 +1447,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (!dev || !netif_device_present(dev))
 		return -ENODEV;
 
-	if (!dev->ethtool_ops)
-		return -EOPNOTSUPP;
-
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (!dev->ethtool_ops) {
+		/* ETHTOOL_GDRVINFO does not require any driver support.
+		 * It is also unprivileged and does not change anything,
+		 * so we can take a shortcut to it. */
+		if (ethcmd == ETHTOOL_GDRVINFO)
+			return ethtool_get_drvinfo(dev, useraddr);
+		else
+			return -EOPNOTSUPP;
+	}
+
 	/* Allow some commands to be done by anyone */
 	switch (ethcmd) {
+	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
 	case ETHTOOL_GCOALESCE:
@@ -1563,6 +1672,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GSSET_INFO:
 		rc = ethtool_get_sset_info(dev, useraddr);
 		break;
+	case ETHTOOL_GRXFHINDIR:
+		rc = ethtool_get_rxfh_indir(dev, useraddr);
+		break;
+	case ETHTOOL_SRXFHINDIR:
+		rc = ethtool_set_rxfh_indir(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 42e84e08a1b..82a4369ae15 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
 }
 EXPORT_SYMBOL_GPL(fib_rules_register);
 
-void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
+static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
 {
 	struct fib_rule *rule, *tmp;
 
@@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
 		fib_rule_put(rule);
 	}
 }
-EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops);
 
 static void fib_rules_put_rcu(struct rcu_head *head)
 {
@@ -182,7 +181,8 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 {
 	int ret = 0;
 
-	if (rule->iifindex && (rule->iifindex != fl->iif))
+	if (rule->iifindex && (rule->iifindex != fl->iif) &&
+	    !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF))
 		goto out;
 
 	if (rule->oifindex && (rule->oifindex != fl->oif))
@@ -225,9 +225,12 @@ jumped:
 			err = ops->action(rule, fl, flags, arg);
 
 		if (err != -EAGAIN) {
-			fib_rule_get(rule);
-			arg->rule = rule;
-			goto out;
+			if ((arg->flags & FIB_LOOKUP_NOREF) ||
+			    likely(atomic_inc_not_zero(&rule->refcnt))) {
+				arg->rule = rule;
+				goto out;
+			}
+			break;
 		}
 	}
 
@@ -348,12 +351,12 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 		list_for_each_entry(r, &ops->rules_list, list) {
 			if (r->pref == rule->target) {
-				rule->ctarget = r;
+				RCU_INIT_POINTER(rule->ctarget, r);
 				break;
 			}
 		}
 
-		if (rule->ctarget == NULL)
+		if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
 			unresolved = 1;
 	} else if (rule->action == FR_ACT_GOTO)
 		goto errout_free;
@@ -370,6 +373,11 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	fib_rule_get(rule);
 
+	if (last)
+		list_add_rcu(&rule->list, &last->list);
+	else
+		list_add_rcu(&rule->list, &ops->rules_list);
+
 	if (ops->unresolved_rules) {
 		/*
 		 * There are unresolved goto rules in the list, check if
@@ -378,7 +386,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		list_for_each_entry(r, &ops->rules_list, list) {
 			if (r->action == FR_ACT_GOTO &&
 			    r->target == rule->pref) {
-				BUG_ON(r->ctarget != NULL);
+				BUG_ON(rtnl_dereference(r->ctarget) != NULL);
 				rcu_assign_pointer(r->ctarget, rule);
 				if (--ops->unresolved_rules == 0)
 					break;
@@ -392,11 +400,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (unresolved)
 		ops->unresolved_rules++;
 
-	if (last)
-		list_add_rcu(&rule->list, &last->list);
-	else
-		list_add_rcu(&rule->list, &ops->rules_list);
-
 	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
 	flush_route_cache(ops);
 	rules_ops_put(ops);
@@ -484,14 +487,13 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		 */
 		if (ops->nr_goto_rules > 0) {
 			list_for_each_entry(tmp, &ops->rules_list, list) {
-				if (tmp->ctarget == rule) {
+				if (rtnl_dereference(tmp->ctarget) == rule) {
 					rcu_assign_pointer(tmp->ctarget, NULL);
 					ops->unresolved_rules++;
 				}
 			}
 		}
 
-		synchronize_rcu();
 		notify_rule_change(RTM_DELRULE, rule, ops, nlh,
 				   NETLINK_CB(skb).pid);
 		fib_rule_put(rule);
@@ -543,7 +545,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	frh->action = rule->action;
 	frh->flags = rule->flags;
 
-	if (rule->action == FR_ACT_GOTO && rule->ctarget == NULL)
+	if (rule->action == FR_ACT_GOTO &&
+	    rcu_dereference_raw(rule->ctarget) == NULL)
 		frh->flags |= FIB_RULE_UNRESOLVED;
 
 	if (rule->iifname[0]) {
diff --git a/net/core/filter.c b/net/core/filter.c
index da69fb728d3..7beaec36b54 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -89,8 +89,8 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	rcu_read_lock_bh();
 	filter = rcu_dereference_bh(sk->sk_filter);
 	if (filter) {
-		unsigned int pkt_len = sk_run_filter(skb, filter->insns,
-				filter->len);
+		unsigned int pkt_len = sk_run_filter(skb, filter->insns, filter->len);
+
 		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 	}
 	rcu_read_unlock_bh();
@@ -128,87 +128,87 @@ unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int
 		fentry = &filter[pc];
 
 		switch (fentry->code) {
-		case BPF_ALU|BPF_ADD|BPF_X:
+		case BPF_S_ALU_ADD_X:
 			A += X;
 			continue;
-		case BPF_ALU|BPF_ADD|BPF_K:
+		case BPF_S_ALU_ADD_K:
 			A += fentry->k;
 			continue;
-		case BPF_ALU|BPF_SUB|BPF_X:
+		case BPF_S_ALU_SUB_X:
 			A -= X;
 			continue;
-		case BPF_ALU|BPF_SUB|BPF_K:
+		case BPF_S_ALU_SUB_K:
 			A -= fentry->k;
 			continue;
-		case BPF_ALU|BPF_MUL|BPF_X:
+		case BPF_S_ALU_MUL_X:
 			A *= X;
 			continue;
-		case BPF_ALU|BPF_MUL|BPF_K:
+		case BPF_S_ALU_MUL_K:
 			A *= fentry->k;
 			continue;
-		case BPF_ALU|BPF_DIV|BPF_X:
+		case BPF_S_ALU_DIV_X:
 			if (X == 0)
 				return 0;
 			A /= X;
 			continue;
-		case BPF_ALU|BPF_DIV|BPF_K:
+		case BPF_S_ALU_DIV_K:
 			A /= fentry->k;
 			continue;
-		case BPF_ALU|BPF_AND|BPF_X:
+		case BPF_S_ALU_AND_X:
 			A &= X;
 			continue;
-		case BPF_ALU|BPF_AND|BPF_K:
+		case BPF_S_ALU_AND_K:
 			A &= fentry->k;
 			continue;
-		case BPF_ALU|BPF_OR|BPF_X:
+		case BPF_S_ALU_OR_X:
 			A |= X;
 			continue;
-		case BPF_ALU|BPF_OR|BPF_K:
+		case BPF_S_ALU_OR_K:
 			A |= fentry->k;
 			continue;
-		case BPF_ALU|BPF_LSH|BPF_X:
+		case BPF_S_ALU_LSH_X:
 			A <<= X;
 			continue;
-		case BPF_ALU|BPF_LSH|BPF_K:
+		case BPF_S_ALU_LSH_K:
 			A <<= fentry->k;
 			continue;
-		case BPF_ALU|BPF_RSH|BPF_X:
+		case BPF_S_ALU_RSH_X:
 			A >>= X;
 			continue;
-		case BPF_ALU|BPF_RSH|BPF_K:
+		case BPF_S_ALU_RSH_K:
 			A >>= fentry->k;
 			continue;
-		case BPF_ALU|BPF_NEG:
+		case BPF_S_ALU_NEG:
 			A = -A;
 			continue;
-		case BPF_JMP|BPF_JA:
+		case BPF_S_JMP_JA:
 			pc += fentry->k;
 			continue;
-		case BPF_JMP|BPF_JGT|BPF_K:
+		case BPF_S_JMP_JGT_K:
 			pc += (A > fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JGE|BPF_K:
+		case BPF_S_JMP_JGE_K:
 			pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JEQ|BPF_K:
+		case BPF_S_JMP_JEQ_K:
 			pc += (A == fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JSET|BPF_K:
+		case BPF_S_JMP_JSET_K:
 			pc += (A & fentry->k) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JGT|BPF_X:
+		case BPF_S_JMP_JGT_X:
 			pc += (A > X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JGE|BPF_X:
+		case BPF_S_JMP_JGE_X:
 			pc += (A >= X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JEQ|BPF_X:
+		case BPF_S_JMP_JEQ_X:
 			pc += (A == X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_JMP|BPF_JSET|BPF_X:
+		case BPF_S_JMP_JSET_X:
 			pc += (A & X) ? fentry->jt : fentry->jf;
 			continue;
-		case BPF_LD|BPF_W|BPF_ABS:
+		case BPF_S_LD_W_ABS:
 			k = fentry->k;
 load_w:
 			ptr = load_pointer(skb, k, 4, &tmp);
@@ -217,7 +217,7 @@ load_w:
 				continue;
 			}
 			break;
-		case BPF_LD|BPF_H|BPF_ABS:
+		case BPF_S_LD_H_ABS:
 			k = fentry->k;
 load_h:
 			ptr = load_pointer(skb, k, 2, &tmp);
@@ -226,7 +226,7 @@ load_h:
 				continue;
 			}
 			break;
-		case BPF_LD|BPF_B|BPF_ABS:
+		case BPF_S_LD_B_ABS:
 			k = fentry->k;
 load_b:
 			ptr = load_pointer(skb, k, 1, &tmp);
@@ -235,54 +235,54 @@ load_b:
 				continue;
 			}
 			break;
-		case BPF_LD|BPF_W|BPF_LEN:
+		case BPF_S_LD_W_LEN:
 			A = skb->len;
 			continue;
-		case BPF_LDX|BPF_W|BPF_LEN:
+		case BPF_S_LDX_W_LEN:
 			X = skb->len;
 			continue;
-		case BPF_LD|BPF_W|BPF_IND:
+		case BPF_S_LD_W_IND:
 			k = X + fentry->k;
 			goto load_w;
-		case BPF_LD|BPF_H|BPF_IND:
+		case BPF_S_LD_H_IND:
 			k = X + fentry->k;
 			goto load_h;
-		case BPF_LD|BPF_B|BPF_IND:
+		case BPF_S_LD_B_IND:
 			k = X + fentry->k;
 			goto load_b;
-		case BPF_LDX|BPF_B|BPF_MSH:
+		case BPF_S_LDX_B_MSH:
 			ptr = load_pointer(skb, fentry->k, 1, &tmp);
 			if (ptr != NULL) {
 				X = (*(u8 *)ptr & 0xf) << 2;
 				continue;
 			}
 			return 0;
-		case BPF_LD|BPF_IMM:
+		case BPF_S_LD_IMM:
 			A = fentry->k;
 			continue;
-		case BPF_LDX|BPF_IMM:
+		case BPF_S_LDX_IMM:
 			X = fentry->k;
 			continue;
-		case BPF_LD|BPF_MEM:
+		case BPF_S_LD_MEM:
 			A = mem[fentry->k];
 			continue;
-		case BPF_LDX|BPF_MEM:
+		case BPF_S_LDX_MEM:
 			X = mem[fentry->k];
 			continue;
-		case BPF_MISC|BPF_TAX:
+		case BPF_S_MISC_TAX:
 			X = A;
 			continue;
-		case BPF_MISC|BPF_TXA:
+		case BPF_S_MISC_TXA:
 			A = X;
 			continue;
-		case BPF_RET|BPF_K:
+		case BPF_S_RET_K:
 			return fentry->k;
-		case BPF_RET|BPF_A:
+		case BPF_S_RET_A:
 			return A;
-		case BPF_ST:
+		case BPF_S_ST:
 			mem[fentry->k] = A;
 			continue;
-		case BPF_STX:
+		case BPF_S_STX:
 			mem[fentry->k] = X;
 			continue;
 		default:
@@ -390,53 +390,128 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 		/* Only allow valid instructions */
 		switch (ftest->code) {
 		case BPF_ALU|BPF_ADD|BPF_K:
+			ftest->code = BPF_S_ALU_ADD_K;
+			break;
 		case BPF_ALU|BPF_ADD|BPF_X:
+			ftest->code = BPF_S_ALU_ADD_X;
+			break;
 		case BPF_ALU|BPF_SUB|BPF_K:
+			ftest->code = BPF_S_ALU_SUB_K;
+			break;
 		case BPF_ALU|BPF_SUB|BPF_X:
+			ftest->code = BPF_S_ALU_SUB_X;
+			break;
 		case BPF_ALU|BPF_MUL|BPF_K:
+			ftest->code = BPF_S_ALU_MUL_K;
+			break;
 		case BPF_ALU|BPF_MUL|BPF_X:
+			ftest->code = BPF_S_ALU_MUL_X;
+			break;
 		case BPF_ALU|BPF_DIV|BPF_X:
+			ftest->code = BPF_S_ALU_DIV_X;
+			break;
 		case BPF_ALU|BPF_AND|BPF_K:
+			ftest->code = BPF_S_ALU_AND_K;
+			break;
 		case BPF_ALU|BPF_AND|BPF_X:
+			ftest->code = BPF_S_ALU_AND_X;
+			break;
 		case BPF_ALU|BPF_OR|BPF_K:
+			ftest->code = BPF_S_ALU_OR_K;
+			break;
 		case BPF_ALU|BPF_OR|BPF_X:
+			ftest->code = BPF_S_ALU_OR_X;
+			break;
 		case BPF_ALU|BPF_LSH|BPF_K:
+			ftest->code = BPF_S_ALU_LSH_K;
+			break;
 		case BPF_ALU|BPF_LSH|BPF_X:
+			ftest->code = BPF_S_ALU_LSH_X;
+			break;
 		case BPF_ALU|BPF_RSH|BPF_K:
+			ftest->code = BPF_S_ALU_RSH_K;
+			break;
 		case BPF_ALU|BPF_RSH|BPF_X:
+			ftest->code = BPF_S_ALU_RSH_X;
+			break;
 		case BPF_ALU|BPF_NEG:
+			ftest->code = BPF_S_ALU_NEG;
+			break;
 		case BPF_LD|BPF_W|BPF_ABS:
+			ftest->code = BPF_S_LD_W_ABS;
+			break;
 		case BPF_LD|BPF_H|BPF_ABS:
+			ftest->code = BPF_S_LD_H_ABS;
+			break;
 		case BPF_LD|BPF_B|BPF_ABS:
+			ftest->code = BPF_S_LD_B_ABS;
+			break;
 		case BPF_LD|BPF_W|BPF_LEN:
+			ftest->code = BPF_S_LD_W_LEN;
+			break;
 		case BPF_LD|BPF_W|BPF_IND:
+			ftest->code = BPF_S_LD_W_IND;
+			break;
 		case BPF_LD|BPF_H|BPF_IND:
+			ftest->code = BPF_S_LD_H_IND;
+			break;
 		case BPF_LD|BPF_B|BPF_IND:
+			ftest->code = BPF_S_LD_B_IND;
+			break;
 		case BPF_LD|BPF_IMM:
+			ftest->code = BPF_S_LD_IMM;
+			break;
 		case BPF_LDX|BPF_W|BPF_LEN:
+			ftest->code = BPF_S_LDX_W_LEN;
+			break;
 		case BPF_LDX|BPF_B|BPF_MSH:
+			ftest->code = BPF_S_LDX_B_MSH;
+			break;
 		case BPF_LDX|BPF_IMM:
+			ftest->code = BPF_S_LDX_IMM;
+			break;
 		case BPF_MISC|BPF_TAX:
+			ftest->code = BPF_S_MISC_TAX;
+			break;
 		case BPF_MISC|BPF_TXA:
+			ftest->code = BPF_S_MISC_TXA;
+			break;
 		case BPF_RET|BPF_K:
+			ftest->code = BPF_S_RET_K;
+			break;
 		case BPF_RET|BPF_A:
+			ftest->code = BPF_S_RET_A;
 			break;
 
 		/* Some instructions need special checks */
 
-		case BPF_ALU|BPF_DIV|BPF_K:
 			/* check for division by zero */
+		case BPF_ALU|BPF_DIV|BPF_K:
 			if (ftest->k == 0)
 				return -EINVAL;
+			ftest->code = BPF_S_ALU_DIV_K;
 			break;
 
+		/* check for invalid memory addresses */
 		case BPF_LD|BPF_MEM:
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			ftest->code = BPF_S_LD_MEM;
+			break;
 		case BPF_LDX|BPF_MEM:
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			ftest->code = BPF_S_LDX_MEM;
+			break;
 		case BPF_ST:
+			if (ftest->k >= BPF_MEMWORDS)
+				return -EINVAL;
+			ftest->code = BPF_S_ST;
+			break;
 		case BPF_STX:
-			/* check for invalid memory addresses */
 			if (ftest->k >= BPF_MEMWORDS)
 				return -EINVAL;
+			ftest->code = BPF_S_STX;
 			break;
 
 		case BPF_JMP|BPF_JA:
@@ -447,28 +522,63 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 			 */
 			if (ftest->k >= (unsigned)(flen-pc-1))
 				return -EINVAL;
+			ftest->code = BPF_S_JMP_JA;
 			break;
 
 		case BPF_JMP|BPF_JEQ|BPF_K:
+			ftest->code = BPF_S_JMP_JEQ_K;
+			break;
 		case BPF_JMP|BPF_JEQ|BPF_X:
+			ftest->code = BPF_S_JMP_JEQ_X;
+			break;
 		case BPF_JMP|BPF_JGE|BPF_K:
+			ftest->code = BPF_S_JMP_JGE_K;
+			break;
 		case BPF_JMP|BPF_JGE|BPF_X:
+			ftest->code = BPF_S_JMP_JGE_X;
+			break;
 		case BPF_JMP|BPF_JGT|BPF_K:
+			ftest->code = BPF_S_JMP_JGT_K;
+			break;
 		case BPF_JMP|BPF_JGT|BPF_X:
+			ftest->code = BPF_S_JMP_JGT_X;
+			break;
 		case BPF_JMP|BPF_JSET|BPF_K:
+			ftest->code = BPF_S_JMP_JSET_K;
+			break;
 		case BPF_JMP|BPF_JSET|BPF_X:
+			ftest->code = BPF_S_JMP_JSET_X;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
 			/* for conditionals both must be safe */
+		switch (ftest->code) {
+		case BPF_S_JMP_JEQ_K:
+		case BPF_S_JMP_JEQ_X:
+		case BPF_S_JMP_JGE_K:
+		case BPF_S_JMP_JGE_X:
+		case BPF_S_JMP_JGT_K:
+		case BPF_S_JMP_JGT_X:
+		case BPF_S_JMP_JSET_X:
+		case BPF_S_JMP_JSET_K:
 			if (pc + ftest->jt + 1 >= flen ||
 			    pc + ftest->jf + 1 >= flen)
 				return -EINVAL;
-			break;
+		}
+	}
 
+	/* last instruction must be a RET code */
+	switch (filter[flen - 1].code) {
+	case BPF_S_RET_K:
+	case BPF_S_RET_A:
+		return 0;
+		break;
 		default:
 			return -EINVAL;
 		}
-	}
-
-	return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
 }
 EXPORT_SYMBOL(sk_chk_filter);
 
@@ -528,10 +638,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 		return err;
 	}
 
-	rcu_read_lock_bh();
-	old_fp = rcu_dereference_bh(sk->sk_filter);
+	old_fp = rcu_dereference_protected(sk->sk_filter,
+					   sock_owned_by_user(sk));
 	rcu_assign_pointer(sk->sk_filter, fp);
-	rcu_read_unlock_bh();
 
 	if (old_fp)
 		sk_filter_delayed_uncharge(sk, old_fp);
@@ -544,14 +653,13 @@ int sk_detach_filter(struct sock *sk)
 	int ret = -ENOENT;
 	struct sk_filter *filter;
 
-	rcu_read_lock_bh();
-	filter = rcu_dereference_bh(sk->sk_filter);
+	filter = rcu_dereference_protected(sk->sk_filter,
+					   sock_owned_by_user(sk));
 	if (filter) {
 		rcu_assign_pointer(sk->sk_filter, NULL);
 		sk_filter_delayed_uncharge(sk, filter);
 		ret = 0;
 	}
-	rcu_read_unlock_bh();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sk_detach_filter);
diff --git a/net/core/flow.c b/net/core/flow.c
index 16190067400..127c8a7ffd6 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -53,8 +53,7 @@ struct flow_flush_info {
 
 struct flow_cache {
 	u32				hash_shift;
-	unsigned long			order;
-	struct flow_cache_percpu	*percpu;
+	struct flow_cache_percpu __percpu *percpu;
 	struct notifier_block		hotcpu_notifier;
 	int				low_watermark;
 	int				high_watermark;
@@ -62,8 +61,9 @@ struct flow_cache {
 };
 
 atomic_t flow_cache_genid = ATOMIC_INIT(0);
+EXPORT_SYMBOL(flow_cache_genid);
 static struct flow_cache flow_cache_global;
-static struct kmem_cache *flow_cachep;
+static struct kmem_cache *flow_cachep __read_mostly;
 
 static DEFINE_SPINLOCK(flow_cache_gc_lock);
 static LIST_HEAD(flow_cache_gc_list);
@@ -176,15 +176,11 @@ static u32 flow_hash_code(struct flow_cache *fc,
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
-		& (flow_cache_hash_size(fc) - 1));
+	return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1);
 }
 
-#if (BITS_PER_LONG == 64)
-typedef u64 flow_compare_t;
-#else
-typedef u32 flow_compare_t;
-#endif
+typedef unsigned long flow_compare_t;
 
 /* I hear what you're saying, use memcmp.  But memcmp cannot make
  * important assumptions that we can here, such as alignment and
@@ -222,7 +218,7 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 	unsigned int hash;
 
 	local_bh_disable();
-	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	fcp = this_cpu_ptr(fc->percpu);
 
 	fle = NULL;
 	flo = NULL;
@@ -291,6 +287,7 @@ ret_object:
 	local_bh_enable();
 	return flo;
 }
+EXPORT_SYMBOL(flow_cache_lookup);
 
 static void flow_cache_flush_tasklet(unsigned long data)
 {
@@ -302,7 +299,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
 
-	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	fcp = this_cpu_ptr(fc->percpu);
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		hlist_for_each_entry_safe(fle, entry, tmp,
 					  &fcp->hash_table[i], u.hlist) {
@@ -355,62 +352,73 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
-					  struct flow_cache_percpu *fcp)
+static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
 {
-	fcp->hash_table = (struct hlist_head *)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
-	if (!fcp->hash_table)
-		panic("NET: failed to allocate flow cache order %lu\n", fc->order);
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+	size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
 
-	fcp->hash_rnd_recalc = 1;
-	fcp->hash_count = 0;
-	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+	if (!fcp->hash_table) {
+		fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+		if (!fcp->hash_table) {
+			pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
+			return -ENOMEM;
+		}
+		fcp->hash_rnd_recalc = 1;
+		fcp->hash_count = 0;
+		tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+	}
+	return 0;
 }
 
-static int flow_cache_cpu(struct notifier_block *nfb,
+static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
 			  unsigned long action,
 			  void *hcpu)
 {
 	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
-	int cpu = (unsigned long) hcpu;
+	int res, cpu = (unsigned long) hcpu;
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
 
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		res = flow_cache_cpu_prepare(fc, cpu);
+		if (res)
+			return notifier_from_errno(res);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
 		__flow_cache_shrink(fc, fcp, 0);
+		break;
+	}
 	return NOTIFY_OK;
 }
 
-static int flow_cache_init(struct flow_cache *fc)
+static int __init flow_cache_init(struct flow_cache *fc)
 {
-	unsigned long order;
 	int i;
 
 	fc->hash_shift = 10;
 	fc->low_watermark = 2 * flow_cache_hash_size(fc);
 	fc->high_watermark = 4 * flow_cache_hash_size(fc);
 
-	for (order = 0;
-	     (PAGE_SIZE << order) <
-		     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
-	     order++)
-		/* NOTHING */;
-	fc->order = order;
 	fc->percpu = alloc_percpu(struct flow_cache_percpu);
+	if (!fc->percpu)
+		return -ENOMEM;
 
-	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
-		    (unsigned long) fc);
-	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&fc->rnd_timer);
-
-	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
-
+	for_each_online_cpu(i) {
+		if (flow_cache_cpu_prepare(fc, i))
+			return -ENOMEM;
+	}
 	fc->hotcpu_notifier = (struct notifier_block){
 		.notifier_call = flow_cache_cpu,
 	};
 	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
+		    (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
+
 	return 0;
 }
 
@@ -424,6 +432,3 @@ static int __init flow_cache_init_global(void)
 }
 
 module_init(flow_cache_init_global);
-
-EXPORT_SYMBOL(flow_cache_genid);
-EXPORT_SYMBOL(flow_cache_lookup);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 785e5276a30..7c2373321b7 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -232,7 +232,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;
 
-	spin_lock(&est_tree_lock);
+	spin_lock_bh(&est_tree_lock);
 	if (!elist[idx].timer.function) {
 		INIT_LIST_HEAD(&elist[idx].list);
 		setup_timer(&elist[idx].timer, est_timer, idx);
@@ -243,7 +243,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 
 	list_add_rcu(&est->list, &elist[idx].list);
 	gen_add_node(est);
-	spin_unlock(&est_tree_lock);
+	spin_unlock_bh(&est_tree_lock);
 
 	return 0;
 }
@@ -263,24 +263,25 @@ static void __gen_kill_estimator(struct rcu_head *head)
  *
  * Removes the rate estimator specified by &bstats and &rate_est.
  *
+ * Note : Caller should respect an RCU grace period before freeing stats_lock
  */
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est *rate_est)
 {
 	struct gen_estimator *e;
 
-	spin_lock(&est_tree_lock);
+	spin_lock_bh(&est_tree_lock);
 	while ((e = gen_find_node(bstats, rate_est))) {
 		rb_erase(&e->node, &est_root);
 
-		write_lock_bh(&est_lock);
+		write_lock(&est_lock);
 		e->bstats = NULL;
-		write_unlock_bh(&est_lock);
+		write_unlock(&est_lock);
 
 		list_del_rcu(&e->list);
 		call_rcu(&e->e_rcu, __gen_kill_estimator);
 	}
-	spin_unlock(&est_tree_lock);
+	spin_unlock_bh(&est_tree_lock);
 }
 EXPORT_SYMBOL(gen_kill_estimator);
 
@@ -319,9 +320,9 @@ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 
 	ASSERT_RTNL();
 
-	spin_lock(&est_tree_lock);
+	spin_lock_bh(&est_tree_lock);
 	res = gen_find_node(bstats, rate_est) != NULL;
-	spin_unlock(&est_tree_lock);
+	spin_unlock_bh(&est_tree_lock);
 
 	return res;
 }
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 393b1d8618e..0452eb27a27 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -73,6 +73,7 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 
 	return 0;
 }
+EXPORT_SYMBOL(gnet_stats_start_copy_compat);
 
 /**
  * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
@@ -93,6 +94,7 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
 {
 	return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
 }
+EXPORT_SYMBOL(gnet_stats_start_copy);
 
 /**
  * gnet_stats_copy_basic - copy basic statistics into statistic TLV
@@ -123,6 +125,7 @@ gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(gnet_stats_copy_basic);
 
 /**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
@@ -154,6 +157,7 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
 
 	return 0;
 }
+EXPORT_SYMBOL(gnet_stats_copy_rate_est);
 
 /**
  * gnet_stats_copy_queue - copy queue statistics into statistics TLV
@@ -181,6 +185,7 @@ gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q)
 
 	return 0;
 }
+EXPORT_SYMBOL(gnet_stats_copy_queue);
 
 /**
  * gnet_stats_copy_app - copy application specific statistics into statistics TLV
@@ -208,6 +213,7 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
 
 	return 0;
 }
+EXPORT_SYMBOL(gnet_stats_copy_app);
 
 /**
  * gnet_stats_finish_copy - finish dumping procedure
@@ -241,12 +247,4 @@ gnet_stats_finish_copy(struct gnet_dump *d)
 	spin_unlock_bh(d->lock);
 	return 0;
 }
-
-
-EXPORT_SYMBOL(gnet_stats_start_copy);
-EXPORT_SYMBOL(gnet_stats_start_copy_compat);
-EXPORT_SYMBOL(gnet_stats_copy_basic);
-EXPORT_SYMBOL(gnet_stats_copy_rate_est);
-EXPORT_SYMBOL(gnet_stats_copy_queue);
-EXPORT_SYMBOL(gnet_stats_copy_app);
 EXPORT_SYMBOL(gnet_stats_finish_copy);
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 1e7f4e91a93..c40f27e7d20 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -37,11 +37,13 @@
 
 int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, int mode)
 {
-	int size, err, ct;
+	int size, ct, err;
 
 	if (m->msg_namelen) {
 		if (mode == VERIFY_READ) {
-			err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
+			void __user *namep;
+			namep = (void __user __force *) m->msg_name;
+			err = move_addr_to_kernel(namep, m->msg_namelen,
 						  address);
 			if (err < 0)
 				return err;
@@ -52,21 +54,20 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
 	}
 
 	size = m->msg_iovlen * sizeof(struct iovec);
-	if (copy_from_user(iov, m->msg_iov, size))
+	if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
 		return -EFAULT;
 
 	m->msg_iov = iov;
 	err = 0;
 
 	for (ct = 0; ct < m->msg_iovlen; ct++) {
-		err += iov[ct].iov_len;
-		/*
-		 * Goal is not to verify user data, but to prevent returning
-		 * negative value, which is interpreted as errno.
-		 * Overflow is still possible, but it is harmless.
-		 */
-		if (err < 0)
-			return -EMSGSIZE;
+		size_t len = iov[ct].iov_len;
+
+		if (len > INT_MAX - err) {
+			len = INT_MAX - err;
+			iov[ct].iov_len = len;
+		}
+		err += len;
 	}
 
 	return err;
@@ -95,6 +96,7 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
 
 	return 0;
 }
+EXPORT_SYMBOL(memcpy_toiovec);
 
 /*
  *	Copy kernel to iovec. Returns -EFAULT on error.
@@ -120,6 +122,7 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
 
 	return 0;
 }
+EXPORT_SYMBOL(memcpy_toiovecend);
 
 /*
  *	Copy iovec to kernel. Returns -EFAULT on error.
@@ -144,6 +147,7 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
 
 	return 0;
 }
+EXPORT_SYMBOL(memcpy_fromiovec);
 
 /*
  *	Copy iovec from kernel. Returns -EFAULT on error.
@@ -172,6 +176,7 @@ int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
 
 	return 0;
 }
+EXPORT_SYMBOL(memcpy_fromiovecend);
 
 /*
  *	And now for the all-in-one: copy and checksum from a user iovec
@@ -256,9 +261,4 @@ out_fault:
 	err = -EFAULT;
 	goto out;
 }
-
 EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
-EXPORT_SYMBOL(memcpy_fromiovec);
-EXPORT_SYMBOL(memcpy_fromiovecend);
-EXPORT_SYMBOL(memcpy_toiovec);
-EXPORT_SYMBOL(memcpy_toiovecend);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index bdbce2f5875..01a1101b593 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -243,5 +243,4 @@ void linkwatch_fire_event(struct net_device *dev)
 
 	linkwatch_schedule_work(urgent);
 }
-
 EXPORT_SYMBOL(linkwatch_fire_event);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a4e0a7482c2..8cc8f9a79db 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
 
 unsigned long neigh_rand_reach_time(unsigned long base)
 {
-	return (base ? (net_random() % base) + (base >> 1) : 0);
+	return base ? (net_random() % base) + (base >> 1) : 0;
 }
 EXPORT_SYMBOL(neigh_rand_reach_time);
 
@@ -131,15 +131,20 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 {
 	int shrunk = 0;
 	int i;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
 
 	write_lock_bh(&tbl->lock);
-	for (i = 0; i <= tbl->hash_mask; i++) {
-		struct neighbour *n, **np;
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	for (i = 0; i <= nht->hash_mask; i++) {
+		struct neighbour *n;
+		struct neighbour __rcu **np;
 
-		np = &tbl->hash_buckets[i];
-		while ((n = *np) != NULL) {
+		np = &nht->hash_buckets[i];
+		while ((n = rcu_dereference_protected(*np,
+					lockdep_is_held(&tbl->lock))) != NULL) {
 			/* Neighbour record may be discarded if:
 			 * - nobody refers to it.
 			 * - it is not permanent
@@ -147,7 +152,9 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 			write_lock(&n->lock);
 			if (atomic_read(&n->refcnt) == 1 &&
 			    !(n->nud_state & NUD_PERMANENT)) {
-				*np	= n->next;
+				rcu_assign_pointer(*np,
+					rcu_dereference_protected(n->next,
+						  lockdep_is_held(&tbl->lock)));
 				n->dead = 1;
 				shrunk	= 1;
 				write_unlock(&n->lock);
@@ -199,16 +206,24 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
 static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 {
 	int i;
+	struct neigh_hash_table *nht;
 
-	for (i = 0; i <= tbl->hash_mask; i++) {
-		struct neighbour *n, **np = &tbl->hash_buckets[i];
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
 
-		while ((n = *np) != NULL) {
+	for (i = 0; i <= nht->hash_mask; i++) {
+		struct neighbour *n;
+		struct neighbour __rcu **np = &nht->hash_buckets[i];
+
+		while ((n = rcu_dereference_protected(*np,
+					lockdep_is_held(&tbl->lock))) != NULL) {
 			if (dev && n->dev != dev) {
 				np = &n->next;
 				continue;
 			}
-			*np = n->next;
+			rcu_assign_pointer(*np,
+				   rcu_dereference_protected(n->next,
+						lockdep_is_held(&tbl->lock)));
 			write_lock(&n->lock);
 			neigh_del_timer(n);
 			n->dead = 1;
@@ -279,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
 
 	skb_queue_head_init(&n->arp_queue);
 	rwlock_init(&n->lock);
+	seqlock_init(&n->ha_lock);
 	n->updated	  = n->used = now;
 	n->nud_state	  = NUD_NONE;
 	n->output	  = neigh_blackhole;
@@ -297,64 +313,86 @@ out_entries:
 	goto out;
 }
 
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
 {
-	unsigned long size = entries * sizeof(struct neighbour *);
-	struct neighbour **ret;
+	size_t size = entries * sizeof(struct neighbour *);
+	struct neigh_hash_table *ret;
+	struct neighbour **buckets;
 
-	if (size <= PAGE_SIZE) {
-		ret = kzalloc(size, GFP_ATOMIC);
-	} else {
-		ret = (struct neighbour **)
-		      __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size));
+	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+	if (!ret)
+		return NULL;
+	if (size <= PAGE_SIZE)
+		buckets = kzalloc(size, GFP_ATOMIC);
+	else
+		buckets = (struct neighbour **)
+			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+					   get_order(size));
+	if (!buckets) {
+		kfree(ret);
+		return NULL;
 	}
+	rcu_assign_pointer(ret->hash_buckets, buckets);
+	ret->hash_mask = entries - 1;
+	get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
 	return ret;
 }
 
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free_rcu(struct rcu_head *head)
 {
-	unsigned long size = entries * sizeof(struct neighbour *);
+	struct neigh_hash_table *nht = container_of(head,
+						    struct neigh_hash_table,
+						    rcu);
+	size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
+	struct neighbour **buckets = nht->hash_buckets;
 
 	if (size <= PAGE_SIZE)
-		kfree(hash);
+		kfree(buckets);
 	else
-		free_pages((unsigned long)hash, get_order(size));
+		free_pages((unsigned long)buckets, get_order(size));
+	kfree(nht);
 }
 
-static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+						unsigned long new_entries)
 {
-	struct neighbour **new_hash, **old_hash;
-	unsigned int i, new_hash_mask, old_entries;
+	unsigned int i, hash;
+	struct neigh_hash_table *new_nht, *old_nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, hash_grows);
 
 	BUG_ON(!is_power_of_2(new_entries));
-	new_hash = neigh_hash_alloc(new_entries);
-	if (!new_hash)
-		return;
-
-	old_entries = tbl->hash_mask + 1;
-	new_hash_mask = new_entries - 1;
-	old_hash = tbl->hash_buckets;
+	old_nht = rcu_dereference_protected(tbl->nht,
+					    lockdep_is_held(&tbl->lock));
+	new_nht = neigh_hash_alloc(new_entries);
+	if (!new_nht)
+		return old_nht;
 
-	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
-	for (i = 0; i < old_entries; i++) {
+	for (i = 0; i <= old_nht->hash_mask; i++) {
 		struct neighbour *n, *next;
 
-		for (n = old_hash[i]; n; n = next) {
-			unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
-
-			hash_val &= new_hash_mask;
-			next = n->next;
-
-			n->next = new_hash[hash_val];
-			new_hash[hash_val] = n;
+		for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
+						   lockdep_is_held(&tbl->lock));
+		     n != NULL;
+		     n = next) {
+			hash = tbl->hash(n->primary_key, n->dev,
+					 new_nht->hash_rnd);
+
+			hash &= new_nht->hash_mask;
+			next = rcu_dereference_protected(n->next,
+						lockdep_is_held(&tbl->lock));
+
+			rcu_assign_pointer(n->next,
+					   rcu_dereference_protected(
+						new_nht->hash_buckets[hash],
+						lockdep_is_held(&tbl->lock)));
+			rcu_assign_pointer(new_nht->hash_buckets[hash], n);
 		}
 	}
-	tbl->hash_buckets = new_hash;
-	tbl->hash_mask = new_hash_mask;
 
-	neigh_hash_free(old_hash, old_entries);
+	rcu_assign_pointer(tbl->nht, new_nht);
+	call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+	return new_nht;
 }
 
 struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -363,19 +401,26 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 	struct neighbour *n;
 	int key_len = tbl->key_len;
 	u32 hash_val;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hash_val = tbl->hash(pkey, dev);
-	for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+
+	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+	     n != NULL;
+	     n = rcu_dereference_bh(n->next)) {
 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
-			neigh_hold(n);
+			if (!atomic_inc_not_zero(&n->refcnt))
+				n = NULL;
 			NEIGH_CACHE_STAT_INC(tbl, hits);
 			break;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
+
+	rcu_read_unlock_bh();
 	return n;
 }
 EXPORT_SYMBOL(neigh_lookup);
@@ -386,20 +431,27 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 	struct neighbour *n;
 	int key_len = tbl->key_len;
 	u32 hash_val;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hash_val = tbl->hash(pkey, NULL);
-	for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
+
+	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+	     n != NULL;
+	     n = rcu_dereference_bh(n->next)) {
 		if (!memcmp(n->primary_key, pkey, key_len) &&
 		    net_eq(dev_net(n->dev), net)) {
-			neigh_hold(n);
+			if (!atomic_inc_not_zero(&n->refcnt))
+				n = NULL;
 			NEIGH_CACHE_STAT_INC(tbl, hits);
 			break;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
+
+	rcu_read_unlock_bh();
 	return n;
 }
 EXPORT_SYMBOL(neigh_lookup_nodev);
@@ -411,6 +463,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	int key_len = tbl->key_len;
 	int error;
 	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+	struct neigh_hash_table *nht;
 
 	if (!n) {
 		rc = ERR_PTR(-ENOBUFS);
@@ -437,18 +490,24 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
 
 	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
 
-	if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
-		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+	if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
+		nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
 
-	hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
 
 	if (n->parms->dead) {
 		rc = ERR_PTR(-EINVAL);
 		goto out_tbl_unlock;
 	}
 
-	for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
+	for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
+					    lockdep_is_held(&tbl->lock));
+	     n1 != NULL;
+	     n1 = rcu_dereference_protected(n1->next,
+			lockdep_is_held(&tbl->lock))) {
 		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
 			neigh_hold(n1);
 			rc = n1;
@@ -456,10 +515,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 		}
 	}
 
-	n->next = tbl->hash_buckets[hash_val];
-	tbl->hash_buckets[hash_val] = n;
 	n->dead = 0;
 	neigh_hold(n);
+	rcu_assign_pointer(n->next,
+			   rcu_dereference_protected(nht->hash_buckets[hash_val],
+						     lockdep_is_held(&tbl->lock)));
+	rcu_assign_pointer(nht->hash_buckets[hash_val], n);
 	write_unlock_bh(&tbl->lock);
 	NEIGH_PRINTK2("neigh %p is created.\n", n);
 	rc = n;
@@ -616,6 +677,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
 		neigh_parms_destroy(parms);
 }
 
+static void neigh_destroy_rcu(struct rcu_head *head)
+{
+	struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+	kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+}
 /*
  *	neighbour must already be out of the table;
  *
@@ -643,8 +710,7 @@ void neigh_destroy(struct neighbour *neigh)
 		write_seqlock_bh(&hh->hh_lock);
 		hh->hh_output = neigh_blackhole;
 		write_sequnlock_bh(&hh->hh_lock);
-		if (atomic_dec_and_test(&hh->hh_refcnt))
-			kfree(hh);
+		hh_cache_put(hh);
 	}
 
 	skb_queue_purge(&neigh->arp_queue);
@@ -655,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh)
 	NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
 
 	atomic_dec(&neigh->tbl->entries);
-	kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
+	call_rcu(&neigh->rcu, neigh_destroy_rcu);
 }
 EXPORT_SYMBOL(neigh_destroy);
 
@@ -696,12 +762,16 @@ static void neigh_connect(struct neighbour *neigh)
 static void neigh_periodic_work(struct work_struct *work)
 {
 	struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
-	struct neighbour *n, **np;
+	struct neighbour *n;
+	struct neighbour __rcu **np;
 	unsigned int i;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
 
 	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
 
 	/*
 	 *	periodically recompute ReachableTime from random function
@@ -715,10 +785,11 @@ static void neigh_periodic_work(struct work_struct *work)
 				neigh_rand_reach_time(p->base_reachable_time);
 	}
 
-	for (i = 0 ; i <= tbl->hash_mask; i++) {
-		np = &tbl->hash_buckets[i];
+	for (i = 0 ; i <= nht->hash_mask; i++) {
+		np = &nht->hash_buckets[i];
 
-		while ((n = *np) != NULL) {
+		while ((n = rcu_dereference_protected(*np,
+				lockdep_is_held(&tbl->lock))) != NULL) {
 			unsigned int state;
 
 			write_lock(&n->lock);
@@ -766,9 +837,9 @@ next_elt:
 static __inline__ int neigh_max_probes(struct neighbour *n)
 {
 	struct neigh_parms *p = n->parms;
-	return (n->nud_state & NUD_PROBE ?
+	return (n->nud_state & NUD_PROBE) ?
 		p->ucast_probes :
-		p->ucast_probes + p->app_probes + p->mcast_probes);
+		p->ucast_probes + p->app_probes + p->mcast_probes;
 }
 
 static void neigh_invalidate(struct neighbour *neigh)
@@ -945,7 +1016,7 @@ out_unlock_bh:
 }
 EXPORT_SYMBOL(__neigh_event_send);
 
-static void neigh_update_hhs(struct neighbour *neigh)
+static void neigh_update_hhs(const struct neighbour *neigh)
 {
 	struct hh_cache *hh;
 	void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1081,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 	}
 
 	if (lladdr != neigh->ha) {
+		write_seqlock(&neigh->ha_lock);
 		memcpy(&neigh->ha, lladdr, dev->addr_len);
+		write_sequnlock(&neigh->ha_lock);
 		neigh_update_hhs(neigh);
 		if (!(new & NUD_CONNECTED))
 			neigh->confirmed = jiffies -
@@ -1139,44 +1212,73 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 }
 EXPORT_SYMBOL(neigh_event_ns);
 
+static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
+				   __be16 protocol)
+{
+	struct hh_cache *hh;
+
+	smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
+	for (hh = n->hh; hh; hh = hh->hh_next) {
+		if (hh->hh_type == protocol) {
+			atomic_inc(&hh->hh_refcnt);
+			if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+				hh_cache_put(hh);
+			return true;
+		}
+	}
+	return false;
+}
+
+/* called with read_lock_bh(&n->lock); */
 static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 			  __be16 protocol)
 {
 	struct hh_cache	*hh;
 	struct net_device *dev = dst->dev;
 
-	for (hh = n->hh; hh; hh = hh->hh_next)
-		if (hh->hh_type == protocol)
-			break;
+	if (likely(neigh_hh_lookup(n, dst, protocol)))
+		return;
 
-	if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
-		seqlock_init(&hh->hh_lock);
-		hh->hh_type = protocol;
-		atomic_set(&hh->hh_refcnt, 0);
-		hh->hh_next = NULL;
+	/* slow path */
+	hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
+	if (!hh)
+		return;
 
-		if (dev->header_ops->cache(n, hh)) {
-			kfree(hh);
-			hh = NULL;
-		} else {
-			atomic_inc(&hh->hh_refcnt);
-			hh->hh_next = n->hh;
-			n->hh	    = hh;
-			if (n->nud_state & NUD_CONNECTED)
-				hh->hh_output = n->ops->hh_output;
-			else
-				hh->hh_output = n->ops->output;
-		}
+	seqlock_init(&hh->hh_lock);
+	hh->hh_type = protocol;
+	atomic_set(&hh->hh_refcnt, 2);
+
+	if (dev->header_ops->cache(n, hh)) {
+		kfree(hh);
+		return;
 	}
-	if (hh)	{
-		atomic_inc(&hh->hh_refcnt);
-		dst->hh = hh;
+
+	write_lock_bh(&n->lock);
+
+	/* must check if another thread already did the insert */
+	if (neigh_hh_lookup(n, dst, protocol)) {
+		kfree(hh);
+		goto end;
 	}
+
+	if (n->nud_state & NUD_CONNECTED)
+		hh->hh_output = n->ops->hh_output;
+	else
+		hh->hh_output = n->ops->output;
+
+	hh->hh_next = n->hh;
+	smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
+	n->hh	    = hh;
+
+	if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+		hh_cache_put(hh);
+end:
+	write_unlock_bh(&n->lock);
 }
 
 /* This function can be used in contexts, where only old dev_queue_xmit
-   worked, f.e. if you want to override normal output path (eql, shaper),
-   but resolution is not made yet.
+ * worked, f.e. if you want to override normal output path (eql, shaper),
+ * but resolution is not made yet.
  */
 
 int neigh_compat_output(struct sk_buff *skb)
@@ -1210,19 +1312,19 @@ int neigh_resolve_output(struct sk_buff *skb)
 	if (!neigh_event_send(neigh, skb)) {
 		int err;
 		struct net_device *dev = neigh->dev;
-		if (dev->header_ops->cache && !dst->hh) {
-			write_lock_bh(&neigh->lock);
-			if (!dst->hh)
-				neigh_hh_init(neigh, dst, dst->ops->protocol);
-			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-					      neigh->ha, NULL, skb->len);
-			write_unlock_bh(&neigh->lock);
-		} else {
-			read_lock_bh(&neigh->lock);
+		unsigned int seq;
+
+		if (dev->header_ops->cache &&
+		    !dst->hh &&
+		    !(dst->flags & DST_NOCACHE))
+			neigh_hh_init(neigh, dst, dst->ops->protocol);
+
+		do {
+			seq = read_seqbegin(&neigh->ha_lock);
 			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
 					      neigh->ha, NULL, skb->len);
-			read_unlock_bh(&neigh->lock);
-		}
+		} while (read_seqretry(&neigh->ha_lock, seq));
+
 		if (err >= 0)
 			rc = neigh->ops->queue_xmit(skb);
 		else
@@ -1248,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct neighbour *neigh = dst->neighbour;
 	struct net_device *dev = neigh->dev;
+	unsigned int seq;
 
 	__skb_pull(skb, skb_network_offset(skb));
 
-	read_lock_bh(&neigh->lock);
-	err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-			      neigh->ha, NULL, skb->len);
-	read_unlock_bh(&neigh->lock);
+	do {
+		seq = read_seqbegin(&neigh->ha_lock);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+	} while (read_seqretry(&neigh->ha_lock, seq));
+
 	if (err >= 0)
 		err = neigh->ops->queue_xmit(skb);
 	else {
@@ -1436,17 +1541,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 		panic("cannot create neighbour proc dir entry");
 #endif
 
-	tbl->hash_mask = 1;
-	tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
+	tbl->nht = neigh_hash_alloc(8);
 
 	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
 	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
 
-	if (!tbl->hash_buckets || !tbl->phash_buckets)
+	if (!tbl->nht || !tbl->phash_buckets)
 		panic("cannot allocate neighbour cache hashes");
 
-	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
-
 	rwlock_init(&tbl->lock);
 	INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
 	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
@@ -1486,8 +1588,7 @@ int neigh_table_clear(struct neigh_table *tbl)
 	struct neigh_table **tp;
 
 	/* It is not clean... Fix it to unload IPv6 module safely */
-	cancel_delayed_work(&tbl->gc_work);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&tbl->gc_work);
 	del_timer_sync(&tbl->proxy_timer);
 	pneigh_queue_purge(&tbl->proxy_queue);
 	neigh_ifdown(tbl, NULL);
@@ -1502,8 +1603,8 @@ int neigh_table_clear(struct neigh_table *tbl)
 	}
 	write_unlock(&neigh_tbl_lock);
 
-	neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
-	tbl->hash_buckets = NULL;
+	call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+	tbl->nht = NULL;
 
 	kfree(tbl->phash_buckets);
 	tbl->phash_buckets = NULL;
@@ -1529,6 +1630,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err = -EINVAL;
 
+	ASSERT_RTNL();
 	if (nlmsg_len(nlh) < sizeof(*ndm))
 		goto out;
 
@@ -1538,7 +1640,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 	ndm = nlmsg_data(nlh);
 	if (ndm->ndm_ifindex) {
-		dev = dev_get_by_index(net, ndm->ndm_ifindex);
+		dev = __dev_get_by_index(net, ndm->ndm_ifindex);
 		if (dev == NULL) {
 			err = -ENODEV;
 			goto out;
@@ -1554,34 +1656,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		read_unlock(&neigh_tbl_lock);
 
 		if (nla_len(dst_attr) < tbl->key_len)
-			goto out_dev_put;
+			goto out;
 
 		if (ndm->ndm_flags & NTF_PROXY) {
 			err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
-			goto out_dev_put;
+			goto out;
 		}
 
 		if (dev == NULL)
-			goto out_dev_put;
+			goto out;
 
 		neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
 		if (neigh == NULL) {
 			err = -ENOENT;
-			goto out_dev_put;
+			goto out;
 		}
 
 		err = neigh_update(neigh, NULL, NUD_FAILED,
 				   NEIGH_UPDATE_F_OVERRIDE |
 				   NEIGH_UPDATE_F_ADMIN);
 		neigh_release(neigh);
-		goto out_dev_put;
+		goto out;
 	}
 	read_unlock(&neigh_tbl_lock);
 	err = -EAFNOSUPPORT;
 
-out_dev_put:
-	if (dev)
-		dev_put(dev);
 out:
 	return err;
 }
@@ -1595,6 +1694,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err;
 
+	ASSERT_RTNL();
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
 	if (err < 0)
 		goto out;
@@ -1605,14 +1705,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 	ndm = nlmsg_data(nlh);
 	if (ndm->ndm_ifindex) {
-		dev = dev_get_by_index(net, ndm->ndm_ifindex);
+		dev = __dev_get_by_index(net, ndm->ndm_ifindex);
 		if (dev == NULL) {
 			err = -ENODEV;
 			goto out;
 		}
 
 		if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
-			goto out_dev_put;
+			goto out;
 	}
 
 	read_lock(&neigh_tbl_lock);
@@ -1626,7 +1726,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		read_unlock(&neigh_tbl_lock);
 
 		if (nla_len(tb[NDA_DST]) < tbl->key_len)
-			goto out_dev_put;
+			goto out;
 		dst = nla_data(tb[NDA_DST]);
 		lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
 
@@ -1639,29 +1739,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 				pn->flags = ndm->ndm_flags;
 				err = 0;
 			}
-			goto out_dev_put;
+			goto out;
 		}
 
 		if (dev == NULL)
-			goto out_dev_put;
+			goto out;
 
 		neigh = neigh_lookup(tbl, dst, dev);
 		if (neigh == NULL) {
 			if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
 				err = -ENOENT;
-				goto out_dev_put;
+				goto out;
 			}
 
 			neigh = __neigh_lookup_errno(tbl, dst, dev);
 			if (IS_ERR(neigh)) {
 				err = PTR_ERR(neigh);
-				goto out_dev_put;
+				goto out;
 			}
 		} else {
 			if (nlh->nlmsg_flags & NLM_F_EXCL) {
 				err = -EEXIST;
 				neigh_release(neigh);
-				goto out_dev_put;
+				goto out;
 			}
 
 			if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
@@ -1674,15 +1774,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		} else
 			err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
 		neigh_release(neigh);
-		goto out_dev_put;
+		goto out;
 	}
 
 	read_unlock(&neigh_tbl_lock);
 	err = -EAFNOSUPPORT;
-
-out_dev_put:
-	if (dev)
-		dev_put(dev);
 out:
 	return err;
 }
@@ -1748,18 +1844,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 		unsigned long now = jiffies;
 		unsigned int flush_delta = now - tbl->last_flush;
 		unsigned int rand_delta = now - tbl->last_rand;
-
+		struct neigh_hash_table *nht;
 		struct ndt_config ndc = {
 			.ndtc_key_len		= tbl->key_len,
 			.ndtc_entry_size	= tbl->entry_size,
 			.ndtc_entries		= atomic_read(&tbl->entries),
 			.ndtc_last_flush	= jiffies_to_msecs(flush_delta),
 			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
-			.ndtc_hash_rnd		= tbl->hash_rnd,
-			.ndtc_hash_mask		= tbl->hash_mask,
 			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
 		};
 
+		rcu_read_lock_bh();
+		nht = rcu_dereference_bh(tbl->nht);
+		ndc.ndtc_hash_rnd = nht->hash_rnd;
+		ndc.ndtc_hash_mask = nht->hash_mask;
+		rcu_read_unlock_bh();
+
 		NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
 	}
 
@@ -2056,10 +2156,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
 
 	read_lock_bh(&neigh->lock);
 	ndm->ndm_state	 = neigh->nud_state;
-	if ((neigh->nud_state & NUD_VALID) &&
-	    nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
-		read_unlock_bh(&neigh->lock);
-		goto nla_put_failure;
+	if (neigh->nud_state & NUD_VALID) {
+		char haddr[MAX_ADDR_LEN];
+
+		neigh_ha_snapshot(haddr, neigh, neigh->dev);
+		if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+			read_unlock_bh(&neigh->lock);
+			goto nla_put_failure;
+		}
 	}
 
 	ci.ndm_used	 = jiffies_to_clock_t(now - neigh->used);
@@ -2087,18 +2191,23 @@ static void neigh_update_notify(struct neighbour *neigh)
 static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			    struct netlink_callback *cb)
 {
-	struct net * net = sock_net(skb->sk);
+	struct net *net = sock_net(skb->sk);
 	struct neighbour *n;
 	int rc, h, s_h = cb->args[1];
 	int idx, s_idx = idx = cb->args[2];
+	struct neigh_hash_table *nht;
 
-	read_lock_bh(&tbl->lock);
-	for (h = 0; h <= tbl->hash_mask; h++) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+
+	for (h = 0; h <= nht->hash_mask; h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
 			s_idx = 0;
-		for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) {
+		for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+		     n != NULL;
+		     n = rcu_dereference_bh(n->next)) {
 			if (!net_eq(dev_net(n->dev), net))
 				continue;
 			if (idx < s_idx)
@@ -2107,17 +2216,16 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH,
 					    NLM_F_MULTI) <= 0) {
-				read_unlock_bh(&tbl->lock);
 				rc = -1;
 				goto out;
 			}
-		next:
+next:
 			idx++;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
 	rc = skb->len;
 out:
+	rcu_read_unlock_bh();
 	cb->args[1] = h;
 	cb->args[2] = idx;
 	return rc;
@@ -2150,15 +2258,22 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
 {
 	int chain;
+	struct neigh_hash_table *nht;
 
-	read_lock_bh(&tbl->lock);
-	for (chain = 0; chain <= tbl->hash_mask; chain++) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+
+	read_lock(&tbl->lock); /* avoid resizes */
+	for (chain = 0; chain <= nht->hash_mask; chain++) {
 		struct neighbour *n;
 
-		for (n = tbl->hash_buckets[chain]; n; n = n->next)
+		for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+		     n != NULL;
+		     n = rcu_dereference_bh(n->next))
 			cb(n, cookie);
 	}
-	read_unlock_bh(&tbl->lock);
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
 }
 EXPORT_SYMBOL(neigh_for_each);
 
@@ -2167,18 +2282,25 @@ void __neigh_for_each_release(struct neigh_table *tbl,
 			      int (*cb)(struct neighbour *))
 {
 	int chain;
+	struct neigh_hash_table *nht;
 
-	for (chain = 0; chain <= tbl->hash_mask; chain++) {
-		struct neighbour *n, **np;
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	for (chain = 0; chain <= nht->hash_mask; chain++) {
+		struct neighbour *n;
+		struct neighbour __rcu **np;
 
-		np = &tbl->hash_buckets[chain];
-		while ((n = *np) != NULL) {
+		np = &nht->hash_buckets[chain];
+		while ((n = rcu_dereference_protected(*np,
+					lockdep_is_held(&tbl->lock))) != NULL) {
 			int release;
 
 			write_lock(&n->lock);
 			release = cb(n);
 			if (release) {
-				*np = n->next;
+				rcu_assign_pointer(*np,
+					rcu_dereference_protected(n->next,
+						lockdep_is_held(&tbl->lock)));
 				n->dead = 1;
 			} else
 				np = &n->next;
@@ -2196,13 +2318,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 {
 	struct neigh_seq_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct neigh_table *tbl = state->tbl;
+	struct neigh_hash_table *nht = state->nht;
 	struct neighbour *n = NULL;
 	int bucket = state->bucket;
 
 	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
-	for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
-		n = tbl->hash_buckets[bucket];
+	for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
+		n = rcu_dereference_bh(nht->hash_buckets[bucket]);
 
 		while (n) {
 			if (!net_eq(dev_net(n->dev), net))
@@ -2219,8 +2341,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 				break;
 			if (n->nud_state & ~NUD_NOARP)
 				break;
-		next:
-			n = n->next;
+next:
+			n = rcu_dereference_bh(n->next);
 		}
 
 		if (n)
@@ -2237,14 +2359,14 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 {
 	struct neigh_seq_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct neigh_table *tbl = state->tbl;
+	struct neigh_hash_table *nht = state->nht;
 
 	if (state->neigh_sub_iter) {
 		void *v = state->neigh_sub_iter(state, n, pos);
 		if (v)
 			return n;
 	}
-	n = n->next;
+	n = rcu_dereference_bh(n->next);
 
 	while (1) {
 		while (n) {
@@ -2261,17 +2383,17 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 
 			if (n->nud_state & ~NUD_NOARP)
 				break;
-		next:
-			n = n->next;
+next:
+			n = rcu_dereference_bh(n->next);
 		}
 
 		if (n)
 			break;
 
-		if (++state->bucket > tbl->hash_mask)
+		if (++state->bucket > nht->hash_mask)
 			break;
 
-		n = tbl->hash_buckets[state->bucket];
+		n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
 	}
 
 	if (n && pos)
@@ -2369,7 +2491,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
 }
 
 void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
-	__acquires(tbl->lock)
+	__acquires(rcu_bh)
 {
 	struct neigh_seq_state *state = seq->private;
 
@@ -2377,7 +2499,8 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
 	state->bucket = 0;
 	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock_bh();
+	state->nht = rcu_dereference_bh(tbl->nht);
 
 	return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
 }
@@ -2411,12 +2534,9 @@ out:
 EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
-	__releases(tbl->lock)
+	__releases(rcu_bh)
 {
-	struct neigh_seq_state *state = seq->private;
-	struct neigh_table *tbl = state->tbl;
-
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock_bh();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 99e7052d732..a5ff5a89f37 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -29,6 +29,7 @@ static const char fmt_hex[] = "%#x\n";
 static const char fmt_long_hex[] = "%#lx\n";
 static const char fmt_dec[] = "%d\n";
 static const char fmt_ulong[] = "%lu\n";
+static const char fmt_u64[] = "%llu\n";
 
 static inline int dev_isalive(const struct net_device *dev)
 {
@@ -94,6 +95,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 }
 
 NETDEVICE_SHOW(dev_id, fmt_hex);
+NETDEVICE_SHOW(addr_assign_type, fmt_dec);
 NETDEVICE_SHOW(addr_len, fmt_dec);
 NETDEVICE_SHOW(iflink, fmt_dec);
 NETDEVICE_SHOW(ifindex, fmt_dec);
@@ -294,6 +296,7 @@ static ssize_t show_ifalias(struct device *dev,
 }
 
 static struct device_attribute net_class_attributes[] = {
+	__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
 	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
 	__ATTR(dev_id, S_IRUGO, show_dev_id, NULL),
 	__ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias),
@@ -324,14 +327,15 @@ static ssize_t netstat_show(const struct device *d,
 	struct net_device *dev = to_net_dev(d);
 	ssize_t ret = -EINVAL;
 
-	WARN_ON(offset > sizeof(struct net_device_stats) ||
-			offset % sizeof(unsigned long) != 0);
+	WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
+			offset % sizeof(u64) != 0);
 
 	read_lock(&dev_base_lock);
 	if (dev_isalive(dev)) {
-		const struct net_device_stats *stats = dev_get_stats(dev);
-		ret = sprintf(buf, fmt_ulong,
-			      *(unsigned long *)(((u8 *) stats) + offset));
+		struct rtnl_link_stats64 temp;
+		const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+
+		ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset));
 	}
 	read_unlock(&dev_base_lock);
 	return ret;
@@ -343,7 +347,7 @@ static ssize_t show_##name(struct device *d,				\
 			   struct device_attribute *attr, char *buf) 	\
 {									\
 	return netstat_show(d, attr, buf,				\
-			    offsetof(struct net_device_stats, name));	\
+			    offsetof(struct rtnl_link_stats64, name));	\
 }									\
 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
@@ -511,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	return attribute->store(queue, attribute, buf, count);
 }
 
-static struct sysfs_ops rx_queue_sysfs_ops = {
+static const struct sysfs_ops rx_queue_sysfs_ops = {
 	.show = rx_queue_attr_show,
 	.store = rx_queue_attr_store,
 };
@@ -594,7 +598,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
 	}
 
 	spin_lock(&rps_map_lock);
-	old_map = queue->rps_map;
+	old_map = rcu_dereference_protected(queue->rps_map,
+					    lockdep_is_held(&rps_map_lock));
 	rcu_assign_pointer(queue->rps_map, map);
 	spin_unlock(&rps_map_lock);
 
@@ -673,7 +678,8 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 		table = NULL;
 
 	spin_lock(&rps_dev_flow_lock);
-	old_table = queue->rps_flow_table;
+	old_table = rcu_dereference_protected(queue->rps_flow_table,
+					      lockdep_is_held(&rps_dev_flow_lock));
 	rcu_assign_pointer(queue->rps_flow_table, table);
 	spin_unlock(&rps_dev_flow_lock);
 
@@ -701,13 +707,17 @@ static void rx_queue_release(struct kobject *kobj)
 {
 	struct netdev_rx_queue *queue = to_rx_queue(kobj);
 	struct netdev_rx_queue *first = queue->first;
+	struct rps_map *map;
+	struct rps_dev_flow_table *flow_table;
+
 
-	if (queue->rps_map)
-		call_rcu(&queue->rps_map->rcu, rps_map_release);
+	map = rcu_dereference_raw(queue->rps_map);
+	if (map)
+		call_rcu(&map->rcu, rps_map_release);
 
-	if (queue->rps_flow_table)
-		call_rcu(&queue->rps_flow_table->rcu,
-		    rps_dev_flow_table_release);
+	flow_table = rcu_dereference_raw(queue->rps_flow_table);
+	if (flow_table)
+		call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
 
 	if (atomic_dec_and_test(&first->count))
 		kfree(first);
@@ -722,6 +732,7 @@ static struct kobj_type rx_queue_ktype = {
 static int rx_queue_add_kobject(struct net_device *net, int index)
 {
 	struct netdev_rx_queue *queue = net->_rx + index;
+	struct netdev_rx_queue *first = queue->first;
 	struct kobject *kobj = &queue->kobj;
 	int error = 0;
 
@@ -734,38 +745,43 @@ static int rx_queue_add_kobject(struct net_device *net, int index)
 	}
 
 	kobject_uevent(kobj, KOBJ_ADD);
+	atomic_inc(&first->count);
 
 	return error;
 }
 
-static int rx_queue_register_kobjects(struct net_device *net)
+int
+net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
 {
 	int i;
 	int error = 0;
 
-	net->queues_kset = kset_create_and_add("queues",
-	    NULL, &net->dev.kobj);
-	if (!net->queues_kset)
-		return -ENOMEM;
-	for (i = 0; i < net->num_rx_queues; i++) {
+	for (i = old_num; i < new_num; i++) {
 		error = rx_queue_add_kobject(net, i);
-		if (error)
+		if (error) {
+			new_num = old_num;
 			break;
+		}
 	}
 
-	if (error)
-		while (--i >= 0)
-			kobject_put(&net->_rx[i].kobj);
+	while (--i >= new_num)
+		kobject_put(&net->_rx[i].kobj);
 
 	return error;
 }
 
-static void rx_queue_remove_kobjects(struct net_device *net)
+static int rx_queue_register_kobjects(struct net_device *net)
 {
-	int i;
+	net->queues_kset = kset_create_and_add("queues",
+	    NULL, &net->dev.kobj);
+	if (!net->queues_kset)
+		return -ENOMEM;
+	return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues);
+}
 
-	for (i = 0; i < net->num_rx_queues; i++)
-		kobject_put(&net->_rx[i].kobj);
+static void rx_queue_remove_kobjects(struct net_device *net)
+{
+	net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0);
 	kset_unregister(net->queues_kset);
 }
 #endif /* CONFIG_RPS */
@@ -785,12 +801,13 @@ static const void *net_netlink_ns(struct sock *sk)
 	return sock_net(sk);
 }
 
-static struct kobj_ns_type_operations net_ns_type_operations = {
+struct kobj_ns_type_operations net_ns_type_operations = {
 	.type = KOBJ_NS_TYPE_NET,
 	.current_ns = net_current_ns,
 	.netlink_ns = net_netlink_ns,
 	.initial_ns = net_initial_ns,
 };
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
 
 static void net_kobj_ns_exit(struct net *net)
 {
@@ -922,13 +939,12 @@ int netdev_class_create_file(struct class_attribute *class_attr)
 {
 	return class_create_file(&net_class, class_attr);
 }
+EXPORT_SYMBOL(netdev_class_create_file);
 
 void netdev_class_remove_file(struct class_attribute *class_attr)
 {
 	class_remove_file(&net_class, class_attr);
 }
-
-EXPORT_SYMBOL(netdev_class_create_file);
 EXPORT_SYMBOL(netdev_class_remove_file);
 
 int netdev_kobject_init(void)
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 805555e8b18..778e1571548 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -4,4 +4,8 @@
 int netdev_kobject_init(void);
 int netdev_register_kobject(struct net_device *);
 void netdev_unregister_kobject(struct net_device *);
+#ifdef CONFIG_RPS
+int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
+#endif
+
 #endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380ed88..7f1bb2aba03 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/skb.h>
+#include <trace/events/net.h>
 #include <trace/events/napi.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c988e685433..3f860261c5e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -42,7 +42,9 @@ static int net_assign_generic(struct net *net, int id, void *data)
 	BUG_ON(!mutex_is_locked(&net_mutex));
 	BUG_ON(id == 0);
 
-	ng = old_ng = net->gen;
+	old_ng = rcu_dereference_protected(net->gen,
+					   lockdep_is_held(&net_mutex));
+	ng = old_ng;
 	if (old_ng->len >= id)
 		goto assign;
 
diff --git a/net/core/netevent.c b/net/core/netevent.c
index 95f81de8750..865f0ceb81f 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -35,6 +35,7 @@ int register_netevent_notifier(struct notifier_block *nb)
 	err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
 	return err;
 }
+EXPORT_SYMBOL_GPL(register_netevent_notifier);
 
 /**
  *	netevent_unregister_notifier - unregister a netevent notifier block
@@ -50,6 +51,7 @@ int unregister_netevent_notifier(struct notifier_block *nb)
 {
 	return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
 }
+EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
 
 /**
  *	call_netevent_notifiers - call all netevent notifier blocks
@@ -64,7 +66,4 @@ int call_netevent_notifiers(unsigned long val, void *v)
 {
 	return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
 }
-
-EXPORT_SYMBOL_GPL(register_netevent_notifier);
-EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
 EXPORT_SYMBOL_GPL(call_netevent_notifiers);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 94825b10955..4e98ffac3af 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -199,11 +199,13 @@ void netpoll_poll_dev(struct net_device *dev)
 
 	zap_completion_queue();
 }
+EXPORT_SYMBOL(netpoll_poll_dev);
 
 void netpoll_poll(struct netpoll *np)
 {
 	netpoll_poll_dev(np->dev);
 }
+EXPORT_SYMBOL(netpoll_poll);
 
 static void refill_skbs(void)
 {
@@ -286,12 +288,13 @@ static int netpoll_owner_active(struct net_device *dev)
 	return 0;
 }
 
-void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
+			     struct net_device *dev)
 {
 	int status = NETDEV_TX_BUSY;
 	unsigned long tries;
-	struct net_device *dev = np->dev;
 	const struct net_device_ops *ops = dev->netdev_ops;
+	/* It is up to the caller to keep npinfo alive. */
 	struct netpoll_info *npinfo = np->dev->npinfo;
 
 	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
@@ -343,6 +346,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		schedule_delayed_work(&npinfo->tx_work,0);
 	}
 }
+EXPORT_SYMBOL(netpoll_send_skb_on_dev);
 
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 {
@@ -404,6 +408,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
 	netpoll_send_skb(np, skb);
 }
+EXPORT_SYMBOL(netpoll_send_udp);
 
 static void arp_reply(struct sk_buff *skb)
 {
@@ -630,6 +635,7 @@ void netpoll_print_options(struct netpoll *np)
 	printk(KERN_INFO "%s: remote ethernet address %pM\n",
 	                 np->name, np->remote_mac);
 }
+EXPORT_SYMBOL(netpoll_print_options);
 
 int netpoll_parse_options(struct netpoll *np, char *opt)
 {
@@ -722,30 +728,29 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 	       np->name, cur);
 	return -1;
 }
+EXPORT_SYMBOL(netpoll_parse_options);
 
-int netpoll_setup(struct netpoll *np)
+int __netpoll_setup(struct netpoll *np)
 {
-	struct net_device *ndev = NULL;
-	struct in_device *in_dev;
+	struct net_device *ndev = np->dev;
 	struct netpoll_info *npinfo;
-	struct netpoll *npe, *tmp;
+	const struct net_device_ops *ops;
 	unsigned long flags;
 	int err;
 
-	if (np->dev_name)
-		ndev = dev_get_by_name(&init_net, np->dev_name);
-	if (!ndev) {
-		printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
+	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+	    !ndev->netdev_ops->ndo_poll_controller) {
+		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
 		       np->name, np->dev_name);
-		return -ENODEV;
+		err = -ENOTSUPP;
+		goto out;
 	}
 
-	np->dev = ndev;
 	if (!ndev->npinfo) {
 		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
 		if (!npinfo) {
 			err = -ENOMEM;
-			goto put;
+			goto out;
 		}
 
 		npinfo->rx_flags = 0;
@@ -757,6 +762,13 @@ int netpoll_setup(struct netpoll *np)
 		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
 
 		atomic_set(&npinfo->refcnt, 1);
+
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_setup) {
+			err = ops->ndo_netpoll_setup(ndev, npinfo);
+			if (err)
+				goto free_npinfo;
+		}
 	} else {
 		npinfo = ndev->npinfo;
 		atomic_inc(&npinfo->refcnt);
@@ -764,12 +776,37 @@ int netpoll_setup(struct netpoll *np)
 
 	npinfo->netpoll = np;
 
-	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
-	    !ndev->netdev_ops->ndo_poll_controller) {
-		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		list_add_tail(&np->rx, &npinfo->rx_np);
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	/* last thing to do is link it to the net device structure */
+	rcu_assign_pointer(ndev->npinfo, npinfo);
+
+	return 0;
+
+free_npinfo:
+	kfree(npinfo);
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
+
+int netpoll_setup(struct netpoll *np)
+{
+	struct net_device *ndev = NULL;
+	struct in_device *in_dev;
+	int err;
+
+	if (np->dev_name)
+		ndev = dev_get_by_name(&init_net, np->dev_name);
+	if (!ndev) {
+		printk(KERN_ERR "%s: %s doesn't exist, aborting.\n",
 		       np->name, np->dev_name);
-		err = -ENOTSUPP;
-		goto release;
+		return -ENODEV;
 	}
 
 	if (!netif_running(ndev)) {
@@ -785,7 +822,7 @@ int netpoll_setup(struct netpoll *np)
 		if (err) {
 			printk(KERN_ERR "%s: failed to open %s\n",
 			       np->name, ndev->name);
-			goto release;
+			goto put;
 		}
 
 		atleast = jiffies + HZ/10;
@@ -822,7 +859,7 @@ int netpoll_setup(struct netpoll *np)
 			printk(KERN_ERR "%s: no IP address for %s, aborting\n",
 			       np->name, np->dev_name);
 			err = -EDESTADDRREQ;
-			goto release;
+			goto put;
 		}
 
 		np->local_ip = in_dev->ifa_list->ifa_local;
@@ -830,38 +867,25 @@ int netpoll_setup(struct netpoll *np)
 		printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip);
 	}
 
-	if (np->rx_hook) {
-		spin_lock_irqsave(&npinfo->rx_lock, flags);
-		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
-		list_add_tail(&np->rx, &npinfo->rx_np);
-		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-	}
+	np->dev = ndev;
 
 	/* fill up the skb queue */
 	refill_skbs();
 
-	/* last thing to do is link it to the net device structure */
-	ndev->npinfo = npinfo;
+	rtnl_lock();
+	err = __netpoll_setup(np);
+	rtnl_unlock();
 
-	/* avoid racing with NAPI reading npinfo */
-	synchronize_rcu();
+	if (err)
+		goto put;
 
 	return 0;
 
- release:
-	if (!ndev->npinfo) {
-		spin_lock_irqsave(&npinfo->rx_lock, flags);
-		list_for_each_entry_safe(npe, tmp, &npinfo->rx_np, rx) {
-			npe->dev = NULL;
-		}
-		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
-		kfree(npinfo);
-	}
 put:
 	dev_put(ndev);
 	return err;
 }
+EXPORT_SYMBOL(netpoll_setup);
 
 static int __init netpoll_init(void)
 {
@@ -870,49 +894,65 @@ static int __init netpoll_init(void)
 }
 core_initcall(netpoll_init);
 
-void netpoll_cleanup(struct netpoll *np)
+void __netpoll_cleanup(struct netpoll *np)
 {
 	struct netpoll_info *npinfo;
 	unsigned long flags;
 
-	if (np->dev) {
-		npinfo = np->dev->npinfo;
-		if (npinfo) {
-			if (!list_empty(&npinfo->rx_np)) {
-				spin_lock_irqsave(&npinfo->rx_lock, flags);
-				list_del(&np->rx);
-				if (list_empty(&npinfo->rx_np))
-					npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
-				spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-			}
+	npinfo = np->dev->npinfo;
+	if (!npinfo)
+		return;
 
-			if (atomic_dec_and_test(&npinfo->refcnt)) {
-				const struct net_device_ops *ops;
-				skb_queue_purge(&npinfo->arp_tx);
-				skb_queue_purge(&npinfo->txq);
-				cancel_rearming_delayed_work(&npinfo->tx_work);
-
-				/* clean after last, unfinished work */
-				__skb_queue_purge(&npinfo->txq);
-				kfree(npinfo);
-				ops = np->dev->netdev_ops;
-				if (ops->ndo_netpoll_cleanup)
-					ops->ndo_netpoll_cleanup(np->dev);
-				else
-					np->dev->npinfo = NULL;
-			}
-		}
+	if (!list_empty(&npinfo->rx_np)) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		list_del(&np->rx);
+		if (list_empty(&npinfo->rx_np))
+			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+
+	if (atomic_dec_and_test(&npinfo->refcnt)) {
+		const struct net_device_ops *ops;
+
+		ops = np->dev->netdev_ops;
+		if (ops->ndo_netpoll_cleanup)
+			ops->ndo_netpoll_cleanup(np->dev);
+
+		rcu_assign_pointer(np->dev->npinfo, NULL);
+
+		/* avoid racing with NAPI reading npinfo */
+		synchronize_rcu_bh();
+
+		skb_queue_purge(&npinfo->arp_tx);
+		skb_queue_purge(&npinfo->txq);
+		cancel_rearming_delayed_work(&npinfo->tx_work);
 
-		dev_put(np->dev);
+		/* clean after last, unfinished work */
+		__skb_queue_purge(&npinfo->txq);
+		kfree(npinfo);
 	}
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
+
+void netpoll_cleanup(struct netpoll *np)
+{
+	if (!np->dev)
+		return;
 
+	rtnl_lock();
+	__netpoll_cleanup(np);
+	rtnl_unlock();
+
+	dev_put(np->dev);
 	np->dev = NULL;
 }
+EXPORT_SYMBOL(netpoll_cleanup);
 
 int netpoll_trap(void)
 {
 	return atomic_read(&trapped);
 }
+EXPORT_SYMBOL(netpoll_trap);
 
 void netpoll_set_trap(int trap)
 {
@@ -921,14 +961,4 @@ void netpoll_set_trap(int trap)
 	else
 		atomic_dec(&trapped);
 }
-
-EXPORT_SYMBOL(netpoll_send_skb);
 EXPORT_SYMBOL(netpoll_set_trap);
-EXPORT_SYMBOL(netpoll_trap);
-EXPORT_SYMBOL(netpoll_print_options);
-EXPORT_SYMBOL(netpoll_parse_options);
-EXPORT_SYMBOL(netpoll_setup);
-EXPORT_SYMBOL(netpoll_cleanup);
-EXPORT_SYMBOL(netpoll_send_udp);
-EXPORT_SYMBOL(netpoll_poll_dev);
-EXPORT_SYMBOL(netpoll_poll);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 1dacd7ba8db..fbce4b05a53 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -115,6 +115,9 @@
  * command by Adit Ranadive <adit.262@gmail.com>
  *
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/sys.h>
 #include <linux/types.h>
 #include <linux/module.h>
@@ -169,11 +172,13 @@
 #include <asm/dma.h>
 #include <asm/div64.h>		/* do_div */
 
-#define VERSION 	"2.73"
+#define VERSION	"2.74"
 #define IP_NAME_SZ 32
 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
 #define MPLS_STACK_BOTTOM htonl(0x00000100)
 
+#define func_enter() pr_debug("entering %s\n", __func__);
+
 /* Device flag bits */
 #define F_IPSRC_RND   (1<<0)	/* IP-Src Random  */
 #define F_IPDST_RND   (1<<1)	/* IP-Dst Random  */
@@ -424,7 +429,8 @@ static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2)
 }
 
 static const char version[] =
-	"pktgen " VERSION ": Packet Generator for packet performance testing.\n";
+	"Packet Generator for packet performance testing. "
+	"Version: " VERSION "\n";
 
 static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
 static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
@@ -495,7 +501,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
 		pktgen_reset_all_threads();
 
 	else
-		printk(KERN_WARNING "pktgen: Unknown command: %s\n", data);
+		pr_warning("Unknown command: %s\n", data);
 
 	err = count;
 
@@ -723,16 +729,14 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
 	*num = 0;
 
 	for (; i < maxlen; i++) {
+		int value;
 		char c;
 		*num <<= 4;
 		if (get_user(c, &user_buffer[i]))
 			return -EFAULT;
-		if ((c >= '0') && (c <= '9'))
-			*num |= c - '0';
-		else if ((c >= 'a') && (c <= 'f'))
-			*num |= c - 'a' + 10;
-		else if ((c >= 'A') && (c <= 'F'))
-			*num |= c - 'A' + 10;
+		value = hex_to_bin(c);
+		if (value >= 0)
+			*num |= value;
 		else
 			break;
 	}
@@ -767,10 +771,10 @@ done:
 static unsigned long num_arg(const char __user * user_buffer,
 			     unsigned long maxlen, unsigned long *num)
 {
-	int i = 0;
+	int i;
 	*num = 0;
 
-	for (; i < maxlen; i++) {
+	for (i = 0; i < maxlen; i++) {
 		char c;
 		if (get_user(c, &user_buffer[i]))
 			return -EFAULT;
@@ -785,9 +789,9 @@ static unsigned long num_arg(const char __user * user_buffer,
 
 static int strn_len(const char __user * user_buffer, unsigned int maxlen)
 {
-	int i = 0;
+	int i;
 
-	for (; i < maxlen; i++) {
+	for (i = 0; i < maxlen; i++) {
 		char c;
 		if (get_user(c, &user_buffer[i]))
 			return -EFAULT;
@@ -840,9 +844,9 @@ static ssize_t pktgen_if_write(struct file *file,
 			       const char __user * user_buffer, size_t count,
 			       loff_t * offset)
 {
-	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct seq_file *seq = file->private_data;
 	struct pktgen_dev *pkt_dev = seq->private;
-	int i = 0, max, len;
+	int i, max, len;
 	char name[16], valstr[32];
 	unsigned long value = 0;
 	char *pg_result = NULL;
@@ -852,17 +856,17 @@ static ssize_t pktgen_if_write(struct file *file,
 	pg_result = &(pkt_dev->result[0]);
 
 	if (count < 1) {
-		printk(KERN_WARNING "pktgen: wrong command format\n");
+		pr_warning("wrong command format\n");
 		return -EINVAL;
 	}
 
-	max = count - i;
-	tmp = count_trail_chars(&user_buffer[i], max);
+	max = count;
+	tmp = count_trail_chars(user_buffer, max);
 	if (tmp < 0) {
-		printk(KERN_WARNING "pktgen: illegal format\n");
+		pr_warning("illegal format\n");
 		return tmp;
 	}
-	i += tmp;
+	i = tmp;
 
 	/* Read variable name */
 
@@ -883,10 +887,11 @@ static ssize_t pktgen_if_write(struct file *file,
 	i += len;
 
 	if (debug) {
-		char tb[count + 1];
-		if (copy_from_user(tb, user_buffer, count))
+		size_t copy = min(count, 1023);
+		char tb[copy + 1];
+		if (copy_from_user(tb, user_buffer, copy))
 			return -EFAULT;
-		tb[count] = 0;
+		tb[copy] = 0;
 		printk(KERN_DEBUG "pktgen: %s,%lu  buffer -:%s:-\n", name,
 		       (unsigned long)count, tb);
 	}
@@ -980,6 +985,36 @@ static ssize_t pktgen_if_write(struct file *file,
 			(unsigned long long) pkt_dev->delay);
 		return count;
 	}
+	if (!strcmp(name, "rate")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (!value)
+			return len;
+		pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
+		if (debug)
+			pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+		sprintf(pg_result, "OK: rate=%lu", value);
+		return count;
+	}
+	if (!strcmp(name, "ratep")) {
+		len = num_arg(&user_buffer[i], 10, &value);
+		if (len < 0)
+			return len;
+
+		i += len;
+		if (!value)
+			return len;
+		pkt_dev->delay = NSEC_PER_SEC/value;
+		if (debug)
+			pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
+
+		sprintf(pg_result, "OK: rate=%lu", value);
+		return count;
+	}
 	if (!strcmp(name, "udp_src_min")) {
 		len = num_arg(&user_buffer[i], 10, &value);
 		if (len < 0)
@@ -1398,18 +1433,12 @@ static ssize_t pktgen_if_write(struct file *file,
 		i += len;
 
 		for (*m = 0; *v && m < pkt_dev->dst_mac + 6; v++) {
-			if (*v >= '0' && *v <= '9') {
-				*m *= 16;
-				*m += *v - '0';
-			}
-			if (*v >= 'A' && *v <= 'F') {
-				*m *= 16;
-				*m += *v - 'A' + 10;
-			}
-			if (*v >= 'a' && *v <= 'f') {
-				*m *= 16;
-				*m += *v - 'a' + 10;
-			}
+			int value;
+
+			value = hex_to_bin(*v);
+			if (value >= 0)
+				*m = *m * 16 + value;
+
 			if (*v == ':') {
 				m++;
 				*m = 0;
@@ -1440,18 +1469,12 @@ static ssize_t pktgen_if_write(struct file *file,
 		i += len;
 
 		for (*m = 0; *v && m < pkt_dev->src_mac + 6; v++) {
-			if (*v >= '0' && *v <= '9') {
-				*m *= 16;
-				*m += *v - '0';
-			}
-			if (*v >= 'A' && *v <= 'F') {
-				*m *= 16;
-				*m += *v - 'A' + 10;
-			}
-			if (*v >= 'a' && *v <= 'f') {
-				*m *= 16;
-				*m += *v - 'a' + 10;
-			}
+			int value;
+
+			value = hex_to_bin(*v);
+			if (value >= 0)
+				*m = *m * 16 + value;
+
 			if (*v == ':') {
 				m++;
 				*m = 0;
@@ -1740,9 +1763,9 @@ static ssize_t pktgen_thread_write(struct file *file,
 				   const char __user * user_buffer,
 				   size_t count, loff_t * offset)
 {
-	struct seq_file *seq = (struct seq_file *)file->private_data;
+	struct seq_file *seq = file->private_data;
 	struct pktgen_thread *t = seq->private;
-	int i = 0, max, len, ret;
+	int i, max, len, ret;
 	char name[40];
 	char *pg_result;
 
@@ -1751,12 +1774,12 @@ static ssize_t pktgen_thread_write(struct file *file,
 		return -EINVAL;
 	}
 
-	max = count - i;
-	len = count_trail_chars(&user_buffer[i], max);
+	max = count;
+	len = count_trail_chars(user_buffer, max);
 	if (len < 0)
 		return len;
 
-	i += len;
+	i = len;
 
 	/* Read variable name */
 
@@ -1781,7 +1804,7 @@ static ssize_t pktgen_thread_write(struct file *file,
 		       name, (unsigned long)count);
 
 	if (!t) {
-		printk(KERN_ERR "pktgen: ERROR: No thread\n");
+		pr_err("ERROR: No thread\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1874,7 +1897,7 @@ static void pktgen_mark_device(const char *ifname)
 	int i = 0;
 
 	mutex_lock(&pktgen_thread_lock);
-	pr_debug("pktgen: pktgen_mark_device marking %s for removal\n", ifname);
+	pr_debug("%s: marking %s for removal\n", __func__, ifname);
 
 	while (1) {
 
@@ -1883,15 +1906,14 @@ static void pktgen_mark_device(const char *ifname)
 			break;	/* success */
 
 		mutex_unlock(&pktgen_thread_lock);
-		pr_debug("pktgen: pktgen_mark_device waiting for %s "
-				"to disappear....\n", ifname);
+		pr_debug("%s: waiting for %s to disappear....\n",
+			 __func__, ifname);
 		schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try));
 		mutex_lock(&pktgen_thread_lock);
 
 		if (++i >= max_tries) {
-			printk(KERN_ERR "pktgen_mark_device: timed out after "
-			       "waiting %d msec for device %s to be removed\n",
-			       msec_per_try * i, ifname);
+			pr_err("%s: timed out after waiting %d msec for device %s to be removed\n",
+			       __func__, msec_per_try * i, ifname);
 			break;
 		}
 
@@ -1918,8 +1940,8 @@ static void pktgen_change_name(struct net_device *dev)
 							  &pktgen_if_fops,
 							  pkt_dev);
 			if (!pkt_dev->entry)
-				printk(KERN_ERR "pktgen: can't move proc "
-				       " entry for '%s'\n", dev->name);
+				pr_err("can't move proc entry for '%s'\n",
+				       dev->name);
 			break;
 		}
 	}
@@ -1954,7 +1976,7 @@ static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev,
 						 const char *ifname)
 {
 	char b[IFNAMSIZ+5];
-	int i = 0;
+	int i;
 
 	for (i = 0; ifname[i] != '@'; i++) {
 		if (i == IFNAMSIZ)
@@ -1983,15 +2005,15 @@ static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname)
 
 	odev = pktgen_dev_get_by_name(pkt_dev, ifname);
 	if (!odev) {
-		printk(KERN_ERR "pktgen: no such netdevice: \"%s\"\n", ifname);
+		pr_err("no such netdevice: \"%s\"\n", ifname);
 		return -ENODEV;
 	}
 
 	if (odev->type != ARPHRD_ETHER) {
-		printk(KERN_ERR "pktgen: not an ethernet device: \"%s\"\n", ifname);
+		pr_err("not an ethernet device: \"%s\"\n", ifname);
 		err = -EINVAL;
 	} else if (!netif_running(odev)) {
-		printk(KERN_ERR "pktgen: device is down: \"%s\"\n", ifname);
+		pr_err("device is down: \"%s\"\n", ifname);
 		err = -ENETDOWN;
 	} else {
 		pkt_dev->odev = odev;
@@ -2010,8 +2032,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 	int ntxq;
 
 	if (!pkt_dev->odev) {
-		printk(KERN_ERR "pktgen: ERROR: pkt_dev->odev == NULL in "
-		       "setup_inject.\n");
+		pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n");
 		sprintf(pkt_dev->result,
 			"ERROR: pkt_dev->odev == NULL in setup_inject.\n");
 		return;
@@ -2021,19 +2042,15 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 	ntxq = pkt_dev->odev->real_num_tx_queues;
 
 	if (ntxq <= pkt_dev->queue_map_min) {
-		printk(KERN_WARNING "pktgen: WARNING: Requested "
-		       "queue_map_min (zero-based) (%d) exceeds valid range "
-		       "[0 - %d] for (%d) queues on %s, resetting\n",
-		       pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
-		       pkt_dev->odevname);
+		pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+			   pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
+			   pkt_dev->odevname);
 		pkt_dev->queue_map_min = ntxq - 1;
 	}
 	if (pkt_dev->queue_map_max >= ntxq) {
-		printk(KERN_WARNING "pktgen: WARNING: Requested "
-		       "queue_map_max (zero-based) (%d) exceeds valid range "
-		       "[0 - %d] for (%d) queues on %s, resetting\n",
-		       pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
-		       pkt_dev->odevname);
+		pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
+			   pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
+			   pkt_dev->odevname);
 		pkt_dev->queue_map_max = ntxq - 1;
 	}
 
@@ -2093,8 +2110,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 			}
 			rcu_read_unlock();
 			if (err)
-				printk(KERN_ERR "pktgen: ERROR: IPv6 link "
-				       "address not availble.\n");
+				pr_err("ERROR: IPv6 link address not available\n");
 		}
 #endif
 	} else {
@@ -2142,15 +2158,15 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	hrtimer_set_expires(&t.timer, spin_until);
 
-	remaining = ktime_to_us(hrtimer_expires_remaining(&t.timer));
+	remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
 	if (remaining <= 0) {
 		pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
 		return;
 	}
 
 	start_time = ktime_now();
-	if (remaining < 100)
-		udelay(remaining); 	/* really small just spin */
+	if (remaining < 100000)
+		ndelay(remaining);	/* really small just spin */
 	else {
 		/* see do_nanosleep */
 		hrtimer_init_sleeper(&t, current);
@@ -2504,8 +2520,8 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
 {
 	if (pkt_dev->cflows) {
 		/* let go of the SAs if we have them */
-		int i = 0;
-		for (;  i < pkt_dev->cflows; i++) {
+		int i;
+		for (i = 0; i < pkt_dev->cflows; i++) {
 			struct xfrm_state *x = pkt_dev->flows[i].x;
 			if (x) {
 				xfrm_state_put(x);
@@ -2528,8 +2544,8 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
 			if (nhead > 0) {
 				ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC);
 				if (ret < 0) {
-					printk(KERN_ERR "Error expanding "
-					       "ipsec packet %d\n", ret);
+					pr_err("Error expanding ipsec packet %d\n",
+					       ret);
 					goto err;
 				}
 			}
@@ -2538,8 +2554,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
 			skb_pull(skb, ETH_HLEN);
 			ret = pktgen_output_ipsec(skb, pkt_dev);
 			if (ret) {
-				printk(KERN_ERR "Error creating ipsec "
-				       "packet %d\n", ret);
+				pr_err("Error creating ipsec packet %d\n", ret);
 				goto err;
 			}
 			/* restore ll */
@@ -3015,8 +3030,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	if (datalen < sizeof(struct pktgen_hdr)) {
 		datalen = sizeof(struct pktgen_hdr);
 		if (net_ratelimit())
-			printk(KERN_INFO "pktgen: increased datalen to %d\n",
-			       datalen);
+			pr_info("increased datalen to %d\n", datalen);
 	}
 
 	udph->source = htons(pkt_dev->cur_udp_src);
@@ -3143,7 +3157,7 @@ static void pktgen_run(struct pktgen_thread *t)
 	struct pktgen_dev *pkt_dev;
 	int started = 0;
 
-	pr_debug("pktgen: entering pktgen_run. %p\n", t);
+	func_enter();
 
 	if_lock(t);
 	list_for_each_entry(pkt_dev, &t->if_list, list) {
@@ -3176,7 +3190,7 @@ static void pktgen_stop_all_threads_ifs(void)
 {
 	struct pktgen_thread *t;
 
-	pr_debug("pktgen: entering pktgen_stop_all_threads_ifs.\n");
+	func_enter();
 
 	mutex_lock(&pktgen_thread_lock);
 
@@ -3241,7 +3255,7 @@ static void pktgen_run_all_threads(void)
 {
 	struct pktgen_thread *t;
 
-	pr_debug("pktgen: entering pktgen_run_all_threads.\n");
+	func_enter();
 
 	mutex_lock(&pktgen_thread_lock);
 
@@ -3260,7 +3274,7 @@ static void pktgen_reset_all_threads(void)
 {
 	struct pktgen_thread *t;
 
-	pr_debug("pktgen: entering pktgen_reset_all_threads.\n");
+	func_enter();
 
 	mutex_lock(&pktgen_thread_lock);
 
@@ -3310,8 +3324,8 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
 	int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1;
 
 	if (!pkt_dev->running) {
-		printk(KERN_WARNING "pktgen: interface: %s is already "
-		       "stopped\n", pkt_dev->odevname);
+		pr_warning("interface: %s is already stopped\n",
+			   pkt_dev->odevname);
 		return -EINVAL;
 	}
 
@@ -3347,7 +3361,7 @@ static void pktgen_stop(struct pktgen_thread *t)
 {
 	struct pktgen_dev *pkt_dev;
 
-	pr_debug("pktgen: entering pktgen_stop\n");
+	func_enter();
 
 	if_lock(t);
 
@@ -3367,7 +3381,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
 	struct list_head *q, *n;
 	struct pktgen_dev *cur;
 
-	pr_debug("pktgen: entering pktgen_rem_one_if\n");
+	func_enter();
 
 	if_lock(t);
 
@@ -3393,9 +3407,10 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
 	struct list_head *q, *n;
 	struct pktgen_dev *cur;
 
+	func_enter();
+
 	/* Remove all devices, free mem */
 
-	pr_debug("pktgen: entering pktgen_rem_all_ifs\n");
 	if_lock(t);
 
 	list_for_each_safe(q, n, &t->if_list) {
@@ -3477,8 +3492,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 
 		pkt_dev->skb = fill_packet(odev, pkt_dev);
 		if (pkt_dev->skb == NULL) {
-			printk(KERN_ERR "pktgen: ERROR: couldn't "
-			       "allocate skb in fill_packet.\n");
+			pr_err("ERROR: couldn't allocate skb in fill_packet\n");
 			schedule();
 			pkt_dev->clone_count--;	/* back out increment, OOM */
 			return;
@@ -3558,8 +3572,7 @@ static int pktgen_thread_worker(void *arg)
 	init_waitqueue_head(&t->queue);
 	complete(&t->start_done);
 
-	pr_debug("pktgen: starting pktgen/%d:  pid=%d\n",
-		 cpu, task_pid_nr(current));
+	pr_debug("starting pktgen/%d:  pid=%d\n", cpu, task_pid_nr(current));
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
@@ -3612,13 +3625,13 @@ static int pktgen_thread_worker(void *arg)
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 
-	pr_debug("pktgen: %s stopping all device\n", t->tsk->comm);
+	pr_debug("%s stopping all device\n", t->tsk->comm);
 	pktgen_stop(t);
 
-	pr_debug("pktgen: %s removing all device\n", t->tsk->comm);
+	pr_debug("%s removing all device\n", t->tsk->comm);
 	pktgen_rem_all_ifs(t);
 
-	pr_debug("pktgen: %s removing thread.\n", t->tsk->comm);
+	pr_debug("%s removing thread\n", t->tsk->comm);
 	pktgen_rem_thread(t);
 
 	return 0;
@@ -3642,7 +3655,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
 		}
 
 	if_unlock(t);
-	pr_debug("pktgen: find_dev(%s) returning %p\n", ifname, pkt_dev);
+	pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
 	return pkt_dev;
 }
 
@@ -3658,8 +3671,7 @@ static int add_dev_to_thread(struct pktgen_thread *t,
 	if_lock(t);
 
 	if (pkt_dev->pg_thread) {
-		printk(KERN_ERR "pktgen: ERROR: already assigned "
-		       "to a thread.\n");
+		pr_err("ERROR: already assigned to a thread\n");
 		rv = -EBUSY;
 		goto out;
 	}
@@ -3685,7 +3697,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 
 	pkt_dev = __pktgen_NN_threads(ifname, FIND);
 	if (pkt_dev) {
-		printk(KERN_ERR "pktgen: ERROR: interface already used.\n");
+		pr_err("ERROR: interface already used\n");
 		return -EBUSY;
 	}
 
@@ -3730,7 +3742,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir,
 					  &pktgen_if_fops, pkt_dev);
 	if (!pkt_dev->entry) {
-		printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n",
+		pr_err("cannot create %s/%s procfs entry\n",
 		       PG_PROC_DIR, ifname);
 		err = -EINVAL;
 		goto out2;
@@ -3761,8 +3773,7 @@ static int __init pktgen_create_thread(int cpu)
 	t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL,
 			 cpu_to_node(cpu));
 	if (!t) {
-		printk(KERN_ERR "pktgen: ERROR: out of memory, can't "
-		       "create new thread.\n");
+		pr_err("ERROR: out of memory, can't create new thread\n");
 		return -ENOMEM;
 	}
 
@@ -3776,8 +3787,7 @@ static int __init pktgen_create_thread(int cpu)
 
 	p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu);
 	if (IS_ERR(p)) {
-		printk(KERN_ERR "pktgen: kernel_thread() failed "
-		       "for cpu %d\n", t->cpu);
+		pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
 		list_del(&t->th_list);
 		kfree(t);
 		return PTR_ERR(p);
@@ -3788,7 +3798,7 @@ static int __init pktgen_create_thread(int cpu)
 	pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir,
 			      &pktgen_thread_fops, t);
 	if (!pe) {
-		printk(KERN_ERR "pktgen: cannot create %s/%s procfs entry.\n",
+		pr_err("cannot create %s/%s procfs entry\n",
 		       PG_PROC_DIR, t->tsk->comm);
 		kthread_stop(p);
 		list_del(&t->th_list);
@@ -3822,11 +3832,10 @@ static int pktgen_remove_device(struct pktgen_thread *t,
 				struct pktgen_dev *pkt_dev)
 {
 
-	pr_debug("pktgen: remove_device pkt_dev=%p\n", pkt_dev);
+	pr_debug("remove_device pkt_dev=%p\n", pkt_dev);
 
 	if (pkt_dev->running) {
-		printk(KERN_WARNING "pktgen: WARNING: trying to remove a "
-		       "running interface, stopping it now.\n");
+		pr_warning("WARNING: trying to remove a running interface, stopping it now\n");
 		pktgen_stop_device(pkt_dev);
 	}
 
@@ -3857,7 +3866,7 @@ static int __init pg_init(void)
 	int cpu;
 	struct proc_dir_entry *pe;
 
-	printk(KERN_INFO "%s", version);
+	pr_info("%s", version);
 
 	pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net);
 	if (!pg_proc_dir)
@@ -3865,8 +3874,7 @@ static int __init pg_init(void)
 
 	pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops);
 	if (pe == NULL) {
-		printk(KERN_ERR "pktgen: ERROR: cannot create %s "
-		       "procfs entry.\n", PGCTRL);
+		pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL);
 		proc_net_remove(&init_net, PG_PROC_DIR);
 		return -EINVAL;
 	}
@@ -3879,13 +3887,12 @@ static int __init pg_init(void)
 
 		err = pktgen_create_thread(cpu);
 		if (err)
-			printk(KERN_WARNING "pktgen: WARNING: Cannot create "
-			       "thread for cpu %d (%d)\n", cpu, err);
+			pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n",
+				   cpu, err);
 	}
 
 	if (list_empty(&pktgen_threads)) {
-		printk(KERN_ERR "pktgen: ERROR: Initialization failed for "
-		       "all threads\n");
+		pr_err("ERROR: Initialization failed for all threads\n");
 		unregister_netdevice_notifier(&pktgen_notifier_block);
 		remove_proc_entry(PGCTRL, pg_proc_dir);
 		proc_net_remove(&init_net, PG_PROC_DIR);
@@ -3899,8 +3906,6 @@ static void __exit pg_cleanup(void)
 {
 	struct pktgen_thread *t;
 	struct list_head *q, *n;
-	wait_queue_head_t queue;
-	init_waitqueue_head(&queue);
 
 	/* Stop all interfaces & threads */
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1a2af24e9e3..8121268ddbd 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -299,14 +299,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
 	unregister_netdevice_many(&list_kill);
 }
 
-void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
-{
-	rtnl_lock();
-	__rtnl_kill_links(net, ops);
-	rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(rtnl_kill_links);
-
 /**
  * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
  * @ops: struct rtnl_link_ops * to unregister
@@ -579,7 +571,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
 }
 
 static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
-				 const struct net_device_stats *b)
+				 const struct rtnl_link_stats64 *b)
 {
 	a->rx_packets = b->rx_packets;
 	a->tx_packets = b->tx_packets;
@@ -610,38 +602,9 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 	a->tx_compressed = b->tx_compressed;
 }
 
-static void copy_rtnl_link_stats64(void *v, const struct net_device_stats *b)
+static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
 {
-	struct rtnl_link_stats64 a;
-
-	a.rx_packets = b->rx_packets;
-	a.tx_packets = b->tx_packets;
-	a.rx_bytes = b->rx_bytes;
-	a.tx_bytes = b->tx_bytes;
-	a.rx_errors = b->rx_errors;
-	a.tx_errors = b->tx_errors;
-	a.rx_dropped = b->rx_dropped;
-	a.tx_dropped = b->tx_dropped;
-
-	a.multicast = b->multicast;
-	a.collisions = b->collisions;
-
-	a.rx_length_errors = b->rx_length_errors;
-	a.rx_over_errors = b->rx_over_errors;
-	a.rx_crc_errors = b->rx_crc_errors;
-	a.rx_frame_errors = b->rx_frame_errors;
-	a.rx_fifo_errors = b->rx_fifo_errors;
-	a.rx_missed_errors = b->rx_missed_errors;
-
-	a.tx_aborted_errors = b->tx_aborted_errors;
-	a.tx_carrier_errors = b->tx_carrier_errors;
-	a.tx_fifo_errors = b->tx_fifo_errors;
-	a.tx_heartbeat_errors = b->tx_heartbeat_errors;
-	a.tx_window_errors = b->tx_window_errors;
-
-	a.rx_compressed = b->rx_compressed;
-	a.tx_compressed = b->tx_compressed;
-	memcpy(v, &a, sizeof(a));
+	memcpy(v, b, sizeof(*b));
 }
 
 /* All VF info */
@@ -686,7 +649,7 @@ static size_t rtnl_port_size(const struct net_device *dev)
 		return port_self_size;
 }
 
-static inline size_t if_nlmsg_size(const struct net_device *dev)
+static noinline size_t if_nlmsg_size(const struct net_device *dev)
 {
 	return NLMSG_ALIGN(sizeof(struct ifinfomsg))
 	       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
@@ -791,7 +754,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
-	const struct net_device_stats *stats;
+	struct rtnl_link_stats64 temp;
+	const struct rtnl_link_stats64 *stats;
 	struct nlattr *attr;
 
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -847,7 +811,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (attr == NULL)
 		goto nla_put_failure;
 
-	stats = dev_get_stats(dev);
+	stats = dev_get_stats(dev, &temp);
 	copy_rtnl_link_stats(nla_data(attr), stats);
 
 	attr = nla_reserve(skb, IFLA_STATS64,
diff --git a/net/core/scm.c b/net/core/scm.c
index b88f6f9d0b9..413cab89017 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -130,6 +130,7 @@ void __scm_destroy(struct scm_cookie *scm)
 		}
 	}
 }
+EXPORT_SYMBOL(__scm_destroy);
 
 int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 {
@@ -170,6 +171,30 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 			err = scm_check_creds(&p->creds);
 			if (err)
 				goto error;
+
+			if (pid_vnr(p->pid) != p->creds.pid) {
+				struct pid *pid;
+				err = -ESRCH;
+				pid = find_get_pid(p->creds.pid);
+				if (!pid)
+					goto error;
+				put_pid(p->pid);
+				p->pid = pid;
+			}
+
+			if ((p->cred->euid != p->creds.uid) ||
+				(p->cred->egid != p->creds.gid)) {
+				struct cred *cred;
+				err = -ENOMEM;
+				cred = prepare_creds();
+				if (!cred)
+					goto error;
+
+				cred->uid = cred->euid = p->creds.uid;
+				cred->gid = cred->egid = p->creds.uid;
+				put_cred(p->cred);
+				p->cred = cred;
+			}
 			break;
 		default:
 			goto error;
@@ -187,6 +212,7 @@ error:
 	scm_destroy(p);
 	return err;
 }
+EXPORT_SYMBOL(__scm_send);
 
 int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
 {
@@ -225,6 +251,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
 out:
 	return err;
 }
+EXPORT_SYMBOL(put_cmsg);
 
 void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 {
@@ -294,6 +321,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 	 */
 	__scm_destroy(scm);
 }
+EXPORT_SYMBOL(scm_detach_fds);
 
 struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 {
@@ -311,9 +339,4 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 	}
 	return new_fpl;
 }
-
-EXPORT_SYMBOL(__scm_destroy);
-EXPORT_SYMBOL(__scm_send);
-EXPORT_SYMBOL(put_cmsg);
-EXPORT_SYMBOL(scm_detach_fds);
 EXPORT_SYMBOL(scm_fp_dup);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce88293a34e..104f8444754 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -202,8 +202,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
 	skb->end = skb->tail + size;
-	kmemcheck_annotate_bitfield(skb, flags1);
-	kmemcheck_annotate_bitfield(skb, flags2);
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 	skb->mac_header = ~0U;
 #endif
@@ -249,10 +247,9 @@ EXPORT_SYMBOL(__alloc_skb);
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 		unsigned int length, gfp_t gfp_mask)
 {
-	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct sk_buff *skb;
 
-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -261,16 +258,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
-struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
-{
-	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
-	struct page *page;
-
-	page = alloc_pages_node(node, gfp_mask, 0);
-	return page;
-}
-EXPORT_SYMBOL(__netdev_alloc_page);
-
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		int size)
 {
@@ -340,7 +327,7 @@ static void skb_release_data(struct sk_buff *skb)
 				put_page(skb_shinfo(skb)->frags[i].page);
 		}
 
-		if (skb_has_frags(skb))
+		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
 		kfree(skb->head);
@@ -466,6 +453,7 @@ void consume_skb(struct sk_buff *skb)
 		smp_rmb();
 	else if (likely(!atomic_dec_and_test(&skb->users)))
 		return;
+	trace_consume_skb(skb);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);
@@ -685,16 +673,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
-	int headerlen = skb->data - skb->head;
-	/*
-	 *	Allocate the copy buffer
-	 */
-	struct sk_buff *n;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
-#else
-	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
-#endif
+	int headerlen = skb_headroom(skb);
+	unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
+	struct sk_buff *n = alloc_skb(size, gfp_mask);
+
 	if (!n)
 		return NULL;
 
@@ -726,20 +708,14 @@ EXPORT_SYMBOL(skb_copy);
 
 struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 {
-	/*
-	 *	Allocate the copy buffer
-	 */
-	struct sk_buff *n;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end, gfp_mask);
-#else
-	n = alloc_skb(skb->end - skb->head, gfp_mask);
-#endif
+	unsigned int size = skb_end_pointer(skb) - skb->head;
+	struct sk_buff *n = alloc_skb(size, gfp_mask);
+
 	if (!n)
 		goto out;
 
 	/* Set the data pointer */
-	skb_reserve(n, skb->data - skb->head);
+	skb_reserve(n, skb_headroom(skb));
 	/* Set the tail pointer and length */
 	skb_put(n, skb_headlen(skb));
 	/* Copy the bytes */
@@ -759,7 +735,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 		skb_shinfo(n)->nr_frags = i;
 	}
 
-	if (skb_has_frags(skb)) {
+	if (skb_has_frag_list(skb)) {
 		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
 		skb_clone_fraglist(n);
 	}
@@ -791,12 +767,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 {
 	int i;
 	u8 *data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	int size = nhead + skb->end + ntail;
-#else
-	int size = nhead + (skb->end - skb->head) + ntail;
-#endif
+	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
 	long off;
+	bool fastpath;
 
 	BUG_ON(nhead < 0);
 
@@ -810,23 +783,36 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		goto nodata;
 
 	/* Copy only real data... and, alas, header. This should be
-	 * optimized for the cases when header is void. */
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
-	memcpy(data + nhead, skb->head, skb->tail);
-#else
-	memcpy(data + nhead, skb->head, skb->tail - skb->head);
-#endif
-	memcpy(data + size, skb_end_pointer(skb),
-	       sizeof(struct skb_shared_info));
+	 * optimized for the cases when header is void.
+	 */
+	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
 
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		get_page(skb_shinfo(skb)->frags[i].page);
+	memcpy((struct skb_shared_info *)(data + size),
+	       skb_shinfo(skb),
+	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
-	if (skb_has_frags(skb))
-		skb_clone_fraglist(skb);
+	/* Check if we can avoid taking references on fragments if we own
+	 * the last reference on skb->head. (see skb_release_data())
+	 */
+	if (!skb->cloned)
+		fastpath = true;
+	else {
+		int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;
 
-	skb_release_data(skb);
+		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
+	}
+
+	if (fastpath) {
+		kfree(skb->head);
+	} else {
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+			get_page(skb_shinfo(skb)->frags[i].page);
 
+		if (skb_has_frag_list(skb))
+			skb_clone_fraglist(skb);
+
+		skb_release_data(skb);
+	}
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
@@ -1099,7 +1085,7 @@ drop_pages:
 		for (; i < nfrags; i++)
 			put_page(skb_shinfo(skb)->frags[i].page);
 
-		if (skb_has_frags(skb))
+		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 		goto done;
 	}
@@ -1194,7 +1180,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	/* Optimization: no fragments, no reasons to preestimate
 	 * size of pulled pages. Superb.
 	 */
-	if (!skb_has_frags(skb))
+	if (!skb_has_frag_list(skb))
 		goto pull_pages;
 
 	/* Estimate size of pulled pages. */
@@ -2323,7 +2309,7 @@ next_skb:
 		st->frag_data = NULL;
 	}
 
-	if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) {
+	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
 		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
 		st->frag_idx = 0;
 		goto next_skb;
@@ -2486,7 +2472,6 @@ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
 	skb_postpull_rcsum(skb, skb->data, len);
 	return skb->data += len;
 }
-
 EXPORT_SYMBOL_GPL(skb_pull_rcsum);
 
 /**
@@ -2574,6 +2559,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 		__copy_skb_header(nskb, skb);
 		nskb->mac_len = skb->mac_len;
 
+		/* nskb and skb might have different headroom */
+		if (nskb->ip_summed == CHECKSUM_PARTIAL)
+			nskb->csum_start += skb_headroom(nskb) - headroom;
+
 		skb_reset_mac_header(nskb);
 		skb_set_network_header(nskb, skb->mac_len);
 		nskb->transport_header = (nskb->network_header +
@@ -2704,7 +2693,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 		return -E2BIG;
 
 	headroom = skb_headroom(p);
-	nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p));
+	nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
 	if (unlikely(!nskb))
 		return -ENOMEM;
 
@@ -2890,7 +2879,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 		return -ENOMEM;
 
 	/* Easy case. Most of packets will go this way. */
-	if (!skb_has_frags(skb)) {
+	if (!skb_has_frag_list(skb)) {
 		/* A little of trouble, not enough of space for trailer.
 		 * This should not happen, when stack is tuned to generate
 		 * good frames. OK, on miss we reallocate and reserve even more
@@ -2925,7 +2914,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 
 		if (skb1->next == NULL && tailbits) {
 			if (skb_shinfo(skb1)->nr_frags ||
-			    skb_has_frags(skb1) ||
+			    skb_has_frag_list(skb1) ||
 			    skb_tailroom(skb1) < tailbits)
 				ntail = tailbits + 128;
 		}
@@ -2934,7 +2923,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 		    skb_cloned(skb1) ||
 		    ntail ||
 		    skb_shinfo(skb1)->nr_frags ||
-		    skb_has_frags(skb1)) {
+		    skb_has_frag_list(skb1)) {
 			struct sk_buff *skb2;
 
 			/* Fuck, we are miserable poor guys... */
@@ -3017,7 +3006,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 	} else {
 		/*
 		 * no hardware time stamps available,
-		 * so keep the skb_shared_tx and only
+		 * so keep the shared tx_flags and only
 		 * store software time stamp
 		 */
 		skb->tstamp = ktime_get_real();
diff --git a/net/core/sock.c b/net/core/sock.c
index 2cf7f9f7e77..3eed5424e65 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -110,6 +110,7 @@
 #include <linux/tcp.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/user_namespace.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -156,7 +157,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
-  "sk_lock-AF_IEEE802154",
+  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" ,
   "sk_lock-AF_MAX"
 };
 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
@@ -172,7 +173,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
-  "slock-AF_IEEE802154",
+  "slock-AF_IEEE802154", "slock-AF_CAIF" ,
   "slock-AF_MAX"
 };
 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
@@ -188,7 +189,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
-  "clock-AF_IEEE802154",
+  "clock-AF_IEEE802154", "clock-AF_CAIF" ,
   "clock-AF_MAX"
 };
 
@@ -749,6 +750,20 @@ set_rcvbuf:
 EXPORT_SYMBOL(sock_setsockopt);
 
 
+void cred_to_ucred(struct pid *pid, const struct cred *cred,
+		   struct ucred *ucred)
+{
+	ucred->pid = pid_vnr(pid);
+	ucred->uid = ucred->gid = -1;
+	if (cred) {
+		struct user_namespace *current_ns = current_user_ns();
+
+		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
+		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
+	}
+}
+EXPORT_SYMBOL_GPL(cred_to_ucred);
+
 int sock_getsockopt(struct socket *sock, int level, int optname,
 		    char __user *optval, int __user *optlen)
 {
@@ -901,11 +916,15 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_PEERCRED:
-		if (len > sizeof(sk->sk_peercred))
-			len = sizeof(sk->sk_peercred);
-		if (copy_to_user(optval, &sk->sk_peercred, len))
+	{
+		struct ucred peercred;
+		if (len > sizeof(peercred))
+			len = sizeof(peercred);
+		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
+		if (copy_to_user(optval, &peercred, len))
 			return -EFAULT;
 		goto lenout;
+	}
 
 	case SO_PEERNAME:
 	{
@@ -1059,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
 #ifdef CONFIG_CGROUPS
 void sock_update_classid(struct sock *sk)
 {
-	u32 classid = task_cls_classid(current);
+	u32 classid;
 
+	rcu_read_lock();  /* doing current task, which cannot vanish. */
+	classid = task_cls_classid(current);
+	rcu_read_unlock();
 	if (classid && classid != sk->sk_classid)
 		sk->sk_classid = classid;
 }
@@ -1119,6 +1141,9 @@ static void __sk_free(struct sock *sk)
 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
 		       __func__, atomic_read(&sk->sk_omem_alloc));
 
+	if (sk->sk_peer_cred)
+		put_cred(sk->sk_peer_cred);
+	put_pid(sk->sk_peer_pid);
 	put_net(sock_net(sk));
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
@@ -1200,7 +1225,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		sock_reset_flag(newsk, SOCK_DONE);
 		skb_queue_head_init(&newsk->sk_error_queue);
 
-		filter = newsk->sk_filter;
+		filter = rcu_dereference_protected(newsk->sk_filter, 1);
 		if (filter != NULL)
 			sk_filter_charge(newsk, filter);
 
@@ -1317,9 +1342,10 @@ EXPORT_SYMBOL(sock_wfree);
 void sock_rfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
+	unsigned int len = skb->truesize;
 
-	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
-	sk_mem_uncharge(skb->sk, skb->truesize);
+	atomic_sub(len, &sk->sk_rmem_alloc);
+	sk_mem_uncharge(sk, len);
 }
 EXPORT_SYMBOL(sock_rfree);
 
@@ -1328,9 +1354,9 @@ int sock_i_uid(struct sock *sk)
 {
 	int uid;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	return uid;
 }
 EXPORT_SYMBOL(sock_i_uid);
@@ -1339,9 +1365,9 @@ unsigned long sock_i_ino(struct sock *sk)
 {
 	unsigned long ino;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	return ino;
 }
 EXPORT_SYMBOL(sock_i_ino);
@@ -1534,6 +1560,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
 static void __lock_sock(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+	__acquires(&sk->sk_lock.slock)
 {
 	DEFINE_WAIT(wait);
 
@@ -1550,6 +1578,8 @@ static void __lock_sock(struct sock *sk)
 }
 
 static void __release_sock(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+	__acquires(&sk->sk_lock.slock)
 {
 	struct sk_buff *skb = sk->sk_backlog.head;
 
@@ -1954,9 +1984,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_sndmsg_page	=	NULL;
 	sk->sk_sndmsg_off	=	0;
 
-	sk->sk_peercred.pid 	=	0;
-	sk->sk_peercred.uid	=	-1;
-	sk->sk_peercred.gid	=	-1;
+	sk->sk_peer_pid 	=	NULL;
+	sk->sk_peer_cred	=	NULL;
 	sk->sk_write_pending	=	0;
 	sk->sk_rcvlowat		=	1;
 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
@@ -2210,8 +2239,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
 #ifdef CONFIG_NET_NS
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-	int cpu = smp_processor_id();
-	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
+	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -2257,7 +2285,7 @@ static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
+	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
diff --git a/net/core/stream.c b/net/core/stream.c
index cc196f42b8d..f5df85dcd20 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk)
 		rcu_read_unlock();
 	}
 }
-
 EXPORT_SYMBOL(sk_stream_write_space);
 
 /**
@@ -81,7 +80,6 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
 	} while (!done);
 	return 0;
 }
-
 EXPORT_SYMBOL(sk_stream_wait_connect);
 
 /**
@@ -109,7 +107,6 @@ void sk_stream_wait_close(struct sock *sk, long timeout)
 		finish_wait(sk_sleep(sk), &wait);
 	}
 }
-
 EXPORT_SYMBOL(sk_stream_wait_close);
 
 /**
@@ -144,10 +141,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk->sk_write_pending++;
-		sk_wait_event(sk, &current_timeo, !sk->sk_err &&
-						  !(sk->sk_shutdown & SEND_SHUTDOWN) &&
-						  sk_stream_memory_free(sk) &&
-						  vm_wait);
+		sk_wait_event(sk, &current_timeo, sk->sk_err ||
+						  (sk->sk_shutdown & SEND_SHUTDOWN) ||
+						  (sk_stream_memory_free(sk) &&
+						  !vm_wait));
 		sk->sk_write_pending--;
 
 		if (vm_wait) {
@@ -174,7 +171,6 @@ do_interrupted:
 	err = sock_intr_errno(*timeo_p);
 	goto out;
 }
-
 EXPORT_SYMBOL(sk_stream_wait_memory);
 
 int sk_stream_error(struct sock *sk, int flags, int err)
@@ -185,7 +181,6 @@ int sk_stream_error(struct sock *sk, int flags, int err)
 		send_sig(SIGPIPE, current, 0);
 	return err;
 }
-
 EXPORT_SYMBOL(sk_stream_error);
 
 void sk_stream_kill_queues(struct sock *sk)
@@ -210,5 +205,4 @@ void sk_stream_kill_queues(struct sock *sk)
 	 * have gone away, only the net layer knows can touch it.
 	 */
 }
-
 EXPORT_SYMBOL(sk_stream_kill_queues);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 01eee5d984b..385b6095fdc 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -34,7 +34,8 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
 
 	mutex_lock(&sock_flow_mutex);
 
-	orig_sock_table = rps_sock_flow_table;
+	orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+					lockdep_is_held(&sock_flow_mutex));
 	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
 
 	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
new file mode 100644
index 00000000000..0ae6c22da85
--- /dev/null
+++ b/net/core/timestamping.c
@@ -0,0 +1,126 @@
+/*
+ * PTP 1588 clock support - support for timestamping in PHY devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/errqueue.h>
+#include <linux/phy.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+
+static struct sock_filter ptp_filter[] = {
+	PTP_FILTER
+};
+
+static unsigned int classify(struct sk_buff *skb)
+{
+	if (likely(skb->dev &&
+		   skb->dev->phydev &&
+		   skb->dev->phydev->drv))
+		return sk_run_filter(skb, ptp_filter, ARRAY_SIZE(ptp_filter));
+	else
+		return PTP_CLASS_NONE;
+}
+
+void skb_clone_tx_timestamp(struct sk_buff *skb)
+{
+	struct phy_device *phydev;
+	struct sk_buff *clone;
+	struct sock *sk = skb->sk;
+	unsigned int type;
+
+	if (!sk)
+		return;
+
+	type = classify(skb);
+
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+	case PTP_CLASS_V1_IPV6:
+	case PTP_CLASS_V2_IPV4:
+	case PTP_CLASS_V2_IPV6:
+	case PTP_CLASS_V2_L2:
+	case PTP_CLASS_V2_VLAN:
+		phydev = skb->dev->phydev;
+		if (likely(phydev->drv->txtstamp)) {
+			clone = skb_clone(skb, GFP_ATOMIC);
+			if (!clone)
+				return;
+			clone->sk = sk;
+			phydev->drv->txtstamp(phydev, clone, type);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+void skb_complete_tx_timestamp(struct sk_buff *skb,
+			       struct skb_shared_hwtstamps *hwtstamps)
+{
+	struct sock *sk = skb->sk;
+	struct sock_exterr_skb *serr;
+	int err;
+
+	if (!hwtstamps)
+		return;
+
+	*skb_hwtstamps(skb) = *hwtstamps;
+	serr = SKB_EXT_ERR(skb);
+	memset(serr, 0, sizeof(*serr));
+	serr->ee.ee_errno = ENOMSG;
+	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+	skb->sk = NULL;
+	err = sock_queue_err_skb(sk, skb);
+	if (err)
+		kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+
+bool skb_defer_rx_timestamp(struct sk_buff *skb)
+{
+	struct phy_device *phydev;
+	unsigned int type;
+
+	skb_push(skb, ETH_HLEN);
+
+	type = classify(skb);
+
+	skb_pull(skb, ETH_HLEN);
+
+	switch (type) {
+	case PTP_CLASS_V1_IPV4:
+	case PTP_CLASS_V1_IPV6:
+	case PTP_CLASS_V2_IPV4:
+	case PTP_CLASS_V2_IPV6:
+	case PTP_CLASS_V2_L2:
+	case PTP_CLASS_V2_VLAN:
+		phydev = skb->dev->phydev;
+		if (likely(phydev->drv->rxtstamp))
+			return phydev->drv->rxtstamp(phydev, skb, type);
+		break;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+void __init skb_timestamping_init(void)
+{
+	BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter)));
+}
diff --git a/net/core/utils.c b/net/core/utils.c
index 838250241d2..5fea0ab2190 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -75,9 +75,8 @@ __be32 in_aton(const char *str)
 				str++;
 		}
 	}
-	return(htonl(l));
+	return htonl(l);
 }
-
 EXPORT_SYMBOL(in_aton);
 
 #define IN6PTON_XDIGIT		0x00010000
@@ -93,18 +92,19 @@ EXPORT_SYMBOL(in_aton);
 
 static inline int xdigit2bin(char c, int delim)
 {
+	int val;
+
 	if (c == delim || c == '\0')
 		return IN6PTON_DELIM;
 	if (c == ':')
 		return IN6PTON_COLON_MASK;
 	if (c == '.')
 		return IN6PTON_DOT;
-	if (c >= '0' && c <= '9')
-		return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0'));
-	if (c >= 'a' && c <= 'f')
-		return (IN6PTON_XDIGIT | (c - 'a' + 10));
-	if (c >= 'A' && c <= 'F')
-		return (IN6PTON_XDIGIT | (c - 'A' + 10));
+
+	val = hex_to_bin(c);
+	if (val >= 0)
+		return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
+
 	if (delim == -1)
 		return IN6PTON_DELIM;
 	return IN6PTON_UNKNOWN;
@@ -162,7 +162,6 @@ out:
 		*end = s;
 	return ret;
 }
-
 EXPORT_SYMBOL(in4_pton);
 
 int in6_pton(const char *src, int srclen,
@@ -280,7 +279,6 @@ out:
 		*end = s;
 	return ret;
 }
-
 EXPORT_SYMBOL(in6_pton);
 
 void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 01e4d39fa23..92a6fcb40d7 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -82,7 +82,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	elapsed_time = delta / 10;
 
 	if (elapsed_time != 0 &&
-	    dccp_insert_option_elapsed_time(sk, skb, elapsed_time))
+	    dccp_insert_option_elapsed_time(skb, elapsed_time))
 		return -1;
 
 	avr = dccp_ackvec_record_new();
@@ -201,7 +201,7 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
 						 const unsigned int packets,
 						 const unsigned char state)
 {
-	unsigned int gap;
+	long gap;
 	long new_head;
 
 	if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN)
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 6df6f8ac963..75c3582a767 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -62,22 +62,18 @@ struct ccid_operations {
 	void		(*ccid_hc_tx_exit)(struct sock *sk);
 	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_rx_parse_options)(struct sock *sk,
-						    unsigned char option,
-						    unsigned char len, u16 idx,
-						    unsigned char* value);
+	int		(*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
 	int		(*ccid_hc_rx_insert_options)(struct sock *sk,
 						     struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
 						  struct sk_buff *skb);
-	int		(*ccid_hc_tx_parse_options)(struct sock *sk,
-						    unsigned char option,
-						    unsigned char len, u16 idx,
-						    unsigned char* value);
+	int		(*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+						    u8 opt, u8 *val, u8 len);
 	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
 						  struct sk_buff *skb);
 	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
-						  int more, unsigned int len);
+						  unsigned int len);
 	void		(*ccid_hc_rx_get_info)(struct sock *sk,
 					       struct tcp_info *info);
 	void		(*ccid_hc_tx_get_info)(struct sock *sk,
@@ -138,20 +134,48 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
 extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
 extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
 
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing    (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
+	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs  */
+	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode */
+	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode */
+	CCID_PACKET_ERR =		 0xF0000,  /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
+{
+	if (return_code < 0)
+		return CCID_PACKET_ERR;
+	if (return_code == 0)
+		return CCID_PACKET_SEND_AT_ONCE;
+	if (return_code <= CCID_PACKET_DELAY_MAX)
+		return CCID_PACKET_DELAY;
+	return return_code;
+}
+
 static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
 					 struct sk_buff *skb)
 {
-	int rc = 0;
 	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
-	return rc;
+		return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
-					  int more, unsigned int len)
+					  unsigned int len)
 {
 	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
-		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len);
+		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
 }
 
 static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
@@ -168,27 +192,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
 		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
 }
 
+/**
+ * ccid_hc_tx_parse_options  -  Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
 static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
-					   unsigned char option,
-					   unsigned char len, u16 idx,
-					   unsigned char* value)
+					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	int rc = 0;
-	if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx,
-						    value);
-	return rc;
+	if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
 }
 
+/**
+ * ccid_hc_rx_parse_options  -  Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
 static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
-					   unsigned char option,
-					   unsigned char len, u16 idx,
-					   unsigned char* value)
+					   u8 pkt, u8 opt, u8 *val, u8 len)
 {
-	int rc = 0;
-	if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL)
-		rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value);
-	return rc;
+	if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+		return 0;
+	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
 }
 
 static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 8408398cd44..0581143cb80 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG
 
 	  If in doubt, say N.
 
-config IP_DCCP_CCID3_RTO
-	  int "Use higher bound for nofeedback timer"
-	  default 100
-	  depends on IP_DCCP_CCID3 && EXPERIMENTAL
-	  ---help---
-	    Use higher lower bound for nofeedback timer expiration.
-
-	    The TFRC nofeedback timer normally expires after the maximum of 4
-	    RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
-	    with a small RTT this can mean a high processing load and reduced
-	    performance, since then the nofeedback timer is triggered very
-	    frequently.
-
-	    This option enables to set a higher lower bound for the nofeedback
-	    value. Values in units of milliseconds can be set here.
-
-	    A value of 0 disables this feature by enforcing the value specified
-	    in RFC 3448. The following values have been suggested as bounds for
-	    experimental use:
-		* 16-20ms to match the typical multimedia inter-frame interval
-		* 100ms as a reasonable compromise [default]
-		* 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
-
-	    The default of 100ms is a compromise between a large value for
-	    efficient DCCP implementations, and a small value to avoid disrupting
-	    the network in times of congestion.
-
-	    The purpose of the nofeedback timer is to slow DCCP down when there
-	    is serious network congestion: experimenting with larger values should
-	    therefore not be performed on WANs.
-
 config IP_DCCP_TFRC_LIB
 	def_bool y if IP_DCCP_CCID3
 
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 9b3ae9922be..6576eae9e77 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -25,59 +25,14 @@
  */
 #include <linux/slab.h>
 #include "../feat.h"
-#include "../ccid.h"
-#include "../dccp.h"
 #include "ccid2.h"
 
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 static int ccid2_debug;
 #define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-
-static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc)
-{
-	int len = 0;
-	int pipe = 0;
-	struct ccid2_seq *seqp = hc->tx_seqh;
-
-	/* there is data in the chain */
-	if (seqp != hc->tx_seqt) {
-		seqp = seqp->ccid2s_prev;
-		len++;
-		if (!seqp->ccid2s_acked)
-			pipe++;
-
-		while (seqp != hc->tx_seqt) {
-			struct ccid2_seq *prev = seqp->ccid2s_prev;
-
-			len++;
-			if (!prev->ccid2s_acked)
-				pipe++;
-
-			/* packets are sent sequentially */
-			BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq,
-						prev->ccid2s_seq ) >= 0);
-			BUG_ON(time_before(seqp->ccid2s_sent,
-					   prev->ccid2s_sent));
-
-			seqp = prev;
-		}
-	}
-
-	BUG_ON(pipe != hc->tx_pipe);
-	ccid2_pr_debug("len of chain=%d\n", len);
-
-	do {
-		seqp = seqp->ccid2s_prev;
-		len++;
-	} while (seqp != hc->tx_seqh);
-
-	ccid2_pr_debug("total len=%d\n", len);
-	BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN);
-}
 #else
 #define ccid2_pr_debug(format, a...)
-#define ccid2_hc_tx_check_sanity(hc)
 #endif
 
 static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
@@ -123,12 +78,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
 
 static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
-	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
-	if (hc->tx_pipe < hc->tx_cwnd)
-		return 0;
-
-	return 1; /* XXX CCID should dequeue when ready instead of polling */
+	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
+		return CCID_PACKET_WILL_DEQUEUE_LATER;
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
 static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
@@ -156,19 +108,11 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
 	dp->dccps_l_ack_ratio = val;
 }
 
-static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val)
-{
-	ccid2_pr_debug("change SRTT to %ld\n", val);
-	hc->tx_srtt = val;
-}
-
-static void ccid2_start_rto_timer(struct sock *sk);
-
 static void ccid2_hc_tx_rto_expire(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-	long s;
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
@@ -178,23 +122,17 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 
 	ccid2_pr_debug("RTO_EXPIRE\n");
 
-	ccid2_hc_tx_check_sanity(hc);
-
 	/* back-off timer */
 	hc->tx_rto <<= 1;
-
-	s = hc->tx_rto / HZ;
-	if (s > 60)
-		hc->tx_rto = 60 * HZ;
-
-	ccid2_start_rto_timer(sk);
+	if (hc->tx_rto > DCCP_RTO_MAX)
+		hc->tx_rto = DCCP_RTO_MAX;
 
 	/* adjust pipe, cwnd etc */
 	hc->tx_ssthresh = hc->tx_cwnd / 2;
 	if (hc->tx_ssthresh < 2)
 		hc->tx_ssthresh = 2;
-	hc->tx_cwnd	 = 1;
-	hc->tx_pipe	 = 0;
+	hc->tx_cwnd	= 1;
+	hc->tx_pipe	= 0;
 
 	/* clear state about stuff we sent */
 	hc->tx_seqt = hc->tx_seqh;
@@ -204,23 +142,18 @@ static void ccid2_hc_tx_rto_expire(unsigned long data)
 	hc->tx_rpseq    = 0;
 	hc->tx_rpdupack = -1;
 	ccid2_change_l_ack_ratio(sk, 1);
-	ccid2_hc_tx_check_sanity(hc);
+
+	/* if we were blocked before, we may now send cwnd=1 packet */
+	if (sender_was_blocked)
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+	/* restart backed-off timer */
+	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
 
-static void ccid2_start_rto_timer(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
-	ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto);
-
-	BUG_ON(timer_pending(&hc->tx_rtotimer));
-	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
@@ -230,7 +163,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 
 	hc->tx_seqh->ccid2s_seq   = dp->dccps_gss;
 	hc->tx_seqh->ccid2s_acked = 0;
-	hc->tx_seqh->ccid2s_sent  = jiffies;
+	hc->tx_seqh->ccid2s_sent  = ccid2_time_stamp;
 
 	next = hc->tx_seqh->ccid2s_next;
 	/* check if we need to alloc more space */
@@ -296,23 +229,20 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len)
 	}
 #endif
 
-	/* setup RTO timer */
-	if (!timer_pending(&hc->tx_rtotimer))
-		ccid2_start_rto_timer(sk);
+	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
 
 #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
 	do {
 		struct ccid2_seq *seqp = hc->tx_seqt;
 
 		while (seqp != hc->tx_seqh) {
-			ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n",
+			ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
 				       (unsigned long long)seqp->ccid2s_seq,
 				       seqp->ccid2s_acked, seqp->ccid2s_sent);
 			seqp = seqp->ccid2s_next;
 		}
 	} while (0);
 	ccid2_pr_debug("=========\n");
-	ccid2_hc_tx_check_sanity(hc);
 #endif
 }
 
@@ -378,17 +308,87 @@ out_invalid_option:
 	return -1;
 }
 
-static void ccid2_hc_tx_kill_rto_timer(struct sock *sk)
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ *   which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
 {
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	long m = mrtt ? : 1;
 
-	sk_stop_timer(sk, &hc->tx_rtotimer);
-	ccid2_pr_debug("deleted RTO timer\n");
+	if (hc->tx_srtt == 0) {
+		/* First measurement m */
+		hc->tx_srtt = m << 3;
+		hc->tx_mdev = m << 1;
+
+		hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+		hc->tx_rttvar   = hc->tx_mdev_max;
+
+		hc->tx_rtt_seq  = dccp_sk(sk)->dccps_gss;
+	} else {
+		/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+		m -= (hc->tx_srtt >> 3);
+		hc->tx_srtt += m;
+
+		/* Similarly, update scaled mdev with regard to |m| */
+		if (m < 0) {
+			m = -m;
+			m -= (hc->tx_mdev >> 2);
+			/*
+			 * This neutralises RTO increase when RTT < SRTT - mdev
+			 * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+			 * in Linux TCP", USENIX 2002, pp. 49-62).
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (hc->tx_mdev >> 2);
+		}
+		hc->tx_mdev += m;
+
+		if (hc->tx_mdev > hc->tx_mdev_max) {
+			hc->tx_mdev_max = hc->tx_mdev;
+			if (hc->tx_mdev_max > hc->tx_rttvar)
+				hc->tx_rttvar = hc->tx_mdev_max;
+		}
+
+		/*
+		 * Decay RTTVAR at most once per flight, exploiting that
+		 *  1) pipe <= cwnd <= Sequence_Window = W  (RFC 4340, 7.5.2)
+		 *  2) AWL = GSS-W+1 <= GAR <= GSS          (RFC 4340, 7.5.1)
+		 * GAR is a useful bound for FlightSize = pipe.
+		 * AWL is probably too low here, as it over-estimates pipe.
+		 */
+		if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+			if (hc->tx_mdev_max < hc->tx_rttvar)
+				hc->tx_rttvar -= (hc->tx_rttvar -
+						  hc->tx_mdev_max) >> 2;
+			hc->tx_rtt_seq  = dccp_sk(sk)->dccps_gss;
+			hc->tx_mdev_max = tcp_rto_min(sk);
+		}
+	}
+
+	/*
+	 * Set RTO from SRTT and RTTVAR
+	 * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+	 * This agrees with RFC 4341, 5:
+	 *	"Because DCCP does not retransmit data, DCCP does not require
+	 *	 TCP's recommended minimum timeout of one second".
+	 */
+	hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
+
+	if (hc->tx_rto > DCCP_RTO_MAX)
+		hc->tx_rto = DCCP_RTO_MAX;
 }
 
-static inline void ccid2_new_ack(struct sock *sk,
-				 struct ccid2_seq *seqp,
-				 unsigned int *maxincr)
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+			  unsigned int *maxincr)
 {
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
 
@@ -402,93 +402,27 @@ static inline void ccid2_new_ack(struct sock *sk,
 			hc->tx_cwnd += 1;
 			hc->tx_packets_acked = 0;
 	}
-
-	/* update RTO */
-	if (hc->tx_srtt == -1 ||
-	    time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) {
-		unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent;
-		int s;
-
-		/* first measurement */
-		if (hc->tx_srtt == -1) {
-			ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n",
-				       r, jiffies,
-				       (unsigned long long)seqp->ccid2s_seq);
-			ccid2_change_srtt(hc, r);
-			hc->tx_rttvar = r >> 1;
-		} else {
-			/* RTTVAR */
-			long tmp = hc->tx_srtt - r;
-			long srtt;
-
-			if (tmp < 0)
-				tmp *= -1;
-
-			tmp >>= 2;
-			hc->tx_rttvar *= 3;
-			hc->tx_rttvar >>= 2;
-			hc->tx_rttvar += tmp;
-
-			/* SRTT */
-			srtt = hc->tx_srtt;
-			srtt *= 7;
-			srtt >>= 3;
-			tmp = r >> 3;
-			srtt += tmp;
-			ccid2_change_srtt(hc, srtt);
-		}
-		s = hc->tx_rttvar << 2;
-		/* clock granularity is 1 when based on jiffies */
-		if (!s)
-			s = 1;
-		hc->tx_rto = hc->tx_srtt + s;
-
-		/* must be at least a second */
-		s = hc->tx_rto / HZ;
-		/* DCCP doesn't require this [but I like it cuz my code sux] */
-#if 1
-		if (s < 1)
-			hc->tx_rto = HZ;
-#endif
-		/* max 60 seconds */
-		if (s > 60)
-			hc->tx_rto = HZ * 60;
-
-		hc->tx_lastrtt = jiffies;
-
-		ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n",
-			       hc->tx_srtt, hc->tx_rttvar,
-			       hc->tx_rto, HZ, r);
-	}
-
-	/* we got a new ack, so re-start RTO timer */
-	ccid2_hc_tx_kill_rto_timer(sk);
-	ccid2_start_rto_timer(sk);
-}
-
-static void ccid2_hc_tx_dec_pipe(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
-	if (hc->tx_pipe == 0)
-		DCCP_BUG("pipe == 0");
-	else
-		hc->tx_pipe--;
-
-	if (hc->tx_pipe == 0)
-		ccid2_hc_tx_kill_rto_timer(sk);
+	/*
+	 * FIXME: RTT is sampled several times per acknowledgment (for each
+	 * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+	 * This causes the RTT to be over-estimated, since the older entries
+	 * in the Ack Vector have earlier sending times.
+	 * The cleanest solution is to not use the ccid2s_sent field at all
+	 * and instead use DCCP timestamps: requires changes in other places.
+	 */
+	ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
 }
 
 static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
 {
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
 
-	if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) {
+	if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
 		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
 		return;
 	}
 
-	hc->tx_last_cong = jiffies;
+	hc->tx_last_cong = ccid2_time_stamp;
 
 	hc->tx_cwnd      = hc->tx_cwnd / 2 ? : 1U;
 	hc->tx_ssthresh  = max(hc->tx_cwnd, 2U);
@@ -502,6 +436,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
 	u64 ackno, seqno;
 	struct ccid2_seq *seqp;
 	unsigned char *vector;
@@ -510,7 +445,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	int done = 0;
 	unsigned int maxincr = 0;
 
-	ccid2_hc_tx_check_sanity(hc);
 	/* check reverse path congestion */
 	seqno = DCCP_SKB_CB(skb)->dccpd_seq;
 
@@ -620,7 +554,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 					seqp->ccid2s_acked = 1;
 					ccid2_pr_debug("Got ack for %llu\n",
 						       (unsigned long long)seqp->ccid2s_seq);
-					ccid2_hc_tx_dec_pipe(sk);
+					hc->tx_pipe--;
 				}
 				if (seqp == hc->tx_seqt) {
 					done = 1;
@@ -677,7 +611,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 				 * one ack vector.
 				 */
 				ccid2_congestion_event(sk, seqp);
-				ccid2_hc_tx_dec_pipe(sk);
+				hc->tx_pipe--;
 			}
 			if (seqp == hc->tx_seqt)
 				break;
@@ -695,7 +629,15 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		hc->tx_seqt = hc->tx_seqt->ccid2s_next;
 	}
 
-	ccid2_hc_tx_check_sanity(hc);
+	/* restart RTO timer if not all outstanding data has been acked */
+	if (hc->tx_pipe == 0)
+		sk_stop_timer(sk, &hc->tx_rtotimer);
+	else
+		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+
+	/* check if incoming Acks allow pending packets to be sent */
+	if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
+		tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
 }
 
 static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -707,12 +649,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
 	hc->tx_ssthresh = ~0U;
 
-	/*
-	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
-	 * packets for new connections, following the rules from [RFC3390]".
-	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
-	 */
-	hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+	/* Use larger initial windows (RFC 4341, section 5). */
+	hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
 	max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
@@ -723,15 +661,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	if (ccid2_hc_tx_alloc_seq(hc))
 		return -ENOMEM;
 
-	hc->tx_rto	 = 3 * HZ;
-	ccid2_change_srtt(hc, -1);
-	hc->tx_rttvar    = -1;
+	hc->tx_rto	 = DCCP_TIMEOUT_INIT;
 	hc->tx_rpdupack  = -1;
-	hc->tx_last_cong = jiffies;
+	hc->tx_last_cong = ccid2_time_stamp;
 	setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
 			(unsigned long)sk);
-
-	ccid2_hc_tx_check_sanity(hc);
 	return 0;
 }
 
@@ -740,7 +674,7 @@ static void ccid2_hc_tx_exit(struct sock *sk)
 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
 	int i;
 
-	ccid2_hc_tx_kill_rto_timer(sk);
+	sk_stop_timer(sk, &hc->tx_rtotimer);
 
 	for (i = 0; i < hc->tx_seqbufc; i++)
 		kfree(hc->tx_seqbuf[i]);
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 1ec6a30103b..25cb6b216ed 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -18,18 +18,23 @@
 #ifndef _DCCP_CCID2_H_
 #define _DCCP_CCID2_H_
 
-#include <linux/dccp.h>
 #include <linux/timer.h>
 #include <linux/types.h>
 #include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible.
+ */
+#define ccid2_time_stamp	tcp_time_stamp
+
 /* NUMDUPACK parameter from RFC 4341, p. 6 */
 #define NUMDUPACK	3
 
-struct sock;
-
 struct ccid2_seq {
 	u64			ccid2s_seq;
-	unsigned long		ccid2s_sent;
+	u32			ccid2s_sent;
 	int			ccid2s_acked;
 	struct ccid2_seq	*ccid2s_prev;
 	struct ccid2_seq	*ccid2s_next;
@@ -42,7 +47,12 @@ struct ccid2_seq {
  * struct ccid2_hc_tx_sock - CCID2 TX half connection
  * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
  * @tx_packets_acked:	     Ack counter for deriving cwnd growth (RFC 3465)
- * @tx_lastrtt:		     time RTT was last measured
+ * @tx_srtt:		     smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev:		     smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max:	     maximum of @mdev during one flight
+ * @tx_rttvar:		     moving average/maximum of @mdev_max
+ * @tx_rto:		     RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq:		     to decay RTTVAR at most once per flight
  * @tx_rpseq:		     last consecutive seqno
  * @tx_rpdupack:	     dupacks since rpseq
  */
@@ -55,17 +65,27 @@ struct ccid2_hc_tx_sock {
 	int			tx_seqbufc;
 	struct ccid2_seq	*tx_seqh;
 	struct ccid2_seq	*tx_seqt;
-	long			tx_rto;
-	long			tx_srtt;
-	long			tx_rttvar;
-	unsigned long		tx_lastrtt;
+
+	/* RTT measurement: variables/principles are the same as in TCP */
+	u32			tx_srtt,
+				tx_mdev,
+				tx_mdev_max,
+				tx_rttvar,
+				tx_rto;
+	u64			tx_rtt_seq:48;
 	struct timer_list	tx_rtotimer;
+
 	u64			tx_rpseq;
 	int			tx_rpdupack;
-	unsigned long		tx_last_cong;
+	u32			tx_last_cong;
 	u64			tx_high_ack;
 };
 
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
+{
+	return hc->tx_pipe >= hc->tx_cwnd;
+}
+
 struct ccid2_hc_rx_sock {
 	int	rx_data;
 };
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index d3235899c7e..3d604e1349c 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
 	[TFRC_SSTATE_NO_SENT]  = "NO_SENT",
 	[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
 	[TFRC_SSTATE_FBACK]    = "FBACK",
-	[TFRC_SSTATE_TERM]     = "TERM",
 	};
 
 	return ccid3_state_names[state];
@@ -91,19 +90,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk)
 	return scaled_div(w_init << 6, hc->tx_rtt);
 }
 
-/*
- * Recalculate t_ipi and delta (should be called whenever X changes)
+/**
+ * ccid3_update_send_interval  -  Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
  */
 static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
 {
-	/* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */
 	hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
 
-	/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
-	hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN);
-
-	ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi,
-		       hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6));
+	ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+		       hc->tx_s, (unsigned)(hc->tx_x >> 6));
 }
 
 static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
@@ -211,16 +207,19 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 	ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
 		       ccid3_tx_state_name(hc->tx_state));
 
+	/* Ignore and do not restart after leaving the established state */
+	if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
+		goto out;
+
+	/* Reset feedback state to "no feedback received" */
 	if (hc->tx_state == TFRC_SSTATE_FBACK)
 		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-	else if (hc->tx_state != TFRC_SSTATE_NO_FBACK)
-		goto out;
 
 	/*
 	 * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+	 * RTO is 0 if and only if no feedback has been received yet.
 	 */
-	if (hc->tx_t_rto == 0 ||	/* no feedback received yet */
-	    hc->tx_p == 0) {
+	if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
 
 		/* halve send rate directly */
 		hc->tx_x = max(hc->tx_x / 2,
@@ -256,7 +255,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 	 * Set new timeout for the nofeedback timer.
 	 * See comments in packet_recv() regarding the value of t_RTO.
 	 */
-	if (unlikely(hc->tx_t_rto == 0))	/* no feedback yet */
+	if (unlikely(hc->tx_t_rto == 0))	/* no feedback received yet */
 		t_nfb = TFRC_INITIAL_TIMEOUT;
 	else
 		t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
@@ -269,11 +268,11 @@ out:
 	sock_put(sk);
 }
 
-/*
- * returns
- *   > 0: delay (in msecs) that should pass before actually sending
- *   = 0: can send immediately
- *   < 0: error condition; do not send packet
+/**
+ * ccid3_hc_tx_send_packet  -  Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
  */
 static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 {
@@ -290,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(skb->len == 0))
 		return -EBADMSG;
 
-	switch (hc->tx_state) {
-	case TFRC_SSTATE_NO_SENT:
+	if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
 		sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
 			       usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
 		hc->tx_last_win_count	= 0;
@@ -326,27 +324,22 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		ccid3_update_send_interval(hc);
 
 		ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-		break;
-	case TFRC_SSTATE_NO_FBACK:
-	case TFRC_SSTATE_FBACK:
+
+	} else {
 		delay = ktime_us_delta(hc->tx_t_nom, now);
 		ccid3_pr_debug("delay=%ld\n", (long)delay);
 		/*
-		 *	Scheduling of packet transmissions [RFC 3448, 4.6]
+		 *	Scheduling of packet transmissions (RFC 5348, 8.3)
 		 *
 		 * if (t_now > t_nom - delta)
 		 *       // send the packet now
 		 * else
 		 *       // send the packet in (t_nom - t_now) milliseconds.
 		 */
-		if (delay - (s64)hc->tx_delta >= 1000)
-			return (u32)delay / 1000L;
+		if (delay >= TFRC_T_DELTA)
+			return (u32)delay / USEC_PER_MSEC;
 
 		ccid3_hc_tx_update_win_count(hc, now);
-		break;
-	case TFRC_SSTATE_TERM:
-		DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk);
-		return -EINVAL;
 	}
 
 	/* prepare to send now (add options etc.) */
@@ -355,11 +348,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 
 	/* set the nominal send time for the next following packet */
 	hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
-	return 0;
+	return CCID_PACKET_SEND_AT_ONCE;
 }
 
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
-				    unsigned int len)
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
 {
 	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
 
@@ -372,48 +364,34 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more,
 static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-	struct ccid3_options_received *opt_recv;
+	struct tfrc_tx_hist_entry *acked;
 	ktime_t now;
 	unsigned long t_nfb;
-	u32 pinv, r_sample;
+	u32 r_sample;
 
 	/* we are only interested in ACKs */
 	if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
 	      DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
 		return;
-	/* ... and only in the established state */
-	if (hc->tx_state != TFRC_SSTATE_FBACK &&
-	    hc->tx_state != TFRC_SSTATE_NO_FBACK)
-		return;
-
-	opt_recv = &hc->tx_options_received;
-	now = ktime_get_real();
-
-	/* Estimate RTT from history if ACK number is valid */
-	r_sample = tfrc_tx_hist_rtt(hc->tx_hist,
-				    DCCP_SKB_CB(skb)->dccpd_ack_seq, now);
-	if (r_sample == 0) {
-		DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk,
-			  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type),
-			  (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq);
-		return;
-	}
-
-	/* Update receive rate in units of 64 * bytes/second */
-	hc->tx_x_recv = opt_recv->ccid3or_receive_rate;
-	hc->tx_x_recv <<= 6;
-
-	/* Update loss event rate (which is scaled by 1e6) */
-	pinv = opt_recv->ccid3or_loss_event_rate;
-	if (pinv == ~0U || pinv == 0)	       /* see RFC 4342, 8.5   */
-		hc->tx_p = 0;
-	else				       /* can not exceed 100% */
-		hc->tx_p = scaled_div(1, pinv);
 	/*
-	 * Validate new RTT sample and update moving average
+	 * Locate the acknowledged packet in the TX history.
+	 *
+	 * Returning "entry not found" here can for instance happen when
+	 *  - the host has not sent out anything (e.g. a passive server),
+	 *  - the Ack is outdated (packet with higher Ack number was received),
+	 *  - it is a bogus Ack (for a packet not sent on this connection).
 	 */
-	r_sample = dccp_sample_rtt(sk, r_sample);
+	acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+	if (acked == NULL)
+		return;
+	/* For the sake of RTT sampling, ignore/remove all older entries */
+	tfrc_tx_hist_purge(&acked->next);
+
+	/* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+	now	  = ktime_get_real();
+	r_sample  = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
 	hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
+
 	/*
 	 * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
 	 */
@@ -461,13 +439,12 @@ done_computing_x:
 	sk->sk_write_space(sk);
 
 	/*
-	 * Update timeout interval for the nofeedback timer.
-	 * We use a configuration option to increase the lower bound.
-	 * This can help avoid triggering the nofeedback timer too
-	 * often ('spinning') on LANs with small RTTs.
+	 * Update timeout interval for the nofeedback timer. In order to control
+	 * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+	 * tunable RTAX_RTO_MIN value as the lower bound.
 	 */
-	hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO *
-						       (USEC_PER_SEC / 1000)));
+	hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+				  USEC_PER_SEC/HZ * tcp_rto_min(sk));
 	/*
 	 * Schedule no feedback timer to expire in
 	 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
@@ -482,66 +459,41 @@ done_computing_x:
 			   jiffies + usecs_to_jiffies(t_nfb));
 }
 
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
-				     unsigned char len, u16 idx,
-				     unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
 {
-	int rc = 0;
-	const struct dccp_sock *dp = dccp_sk(sk);
 	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-	struct ccid3_options_received *opt_recv;
 	__be32 opt_val;
 
-	opt_recv = &hc->tx_options_received;
-
-	if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
-		opt_recv->ccid3or_seqno		     = dp->dccps_gsr;
-		opt_recv->ccid3or_loss_event_rate    = ~0;
-		opt_recv->ccid3or_loss_intervals_idx = 0;
-		opt_recv->ccid3or_loss_intervals_len = 0;
-		opt_recv->ccid3or_receive_rate	     = 0;
-	}
-
 	switch (option) {
+	case TFRC_OPT_RECEIVE_RATE:
 	case TFRC_OPT_LOSS_EVENT_RATE:
-		if (unlikely(len != 4)) {
-			DCCP_WARN("%s(%p), invalid len %d "
-				  "for TFRC_OPT_LOSS_EVENT_RATE\n",
-				  dccp_role(sk), sk, len);
-			rc = -EINVAL;
-		} else {
-			opt_val = get_unaligned((__be32 *)value);
-			opt_recv->ccid3or_loss_event_rate = ntohl(opt_val);
-			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
-				       dccp_role(sk), sk,
-				       opt_recv->ccid3or_loss_event_rate);
+		/* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+		if (packet_type == DCCP_PKT_DATA)
+			break;
+		if (unlikely(optlen != 4)) {
+			DCCP_WARN("%s(%p), invalid len %d for %u\n",
+				  dccp_role(sk), sk, optlen, option);
+			return -EINVAL;
 		}
-		break;
-	case TFRC_OPT_LOSS_INTERVALS:
-		opt_recv->ccid3or_loss_intervals_idx = idx;
-		opt_recv->ccid3or_loss_intervals_len = len;
-		ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n",
-			       dccp_role(sk), sk,
-			       opt_recv->ccid3or_loss_intervals_idx,
-			       opt_recv->ccid3or_loss_intervals_len);
-		break;
-	case TFRC_OPT_RECEIVE_RATE:
-		if (unlikely(len != 4)) {
-			DCCP_WARN("%s(%p), invalid len %d "
-				  "for TFRC_OPT_RECEIVE_RATE\n",
-				  dccp_role(sk), sk, len);
-			rc = -EINVAL;
-		} else {
-			opt_val = get_unaligned((__be32 *)value);
-			opt_recv->ccid3or_receive_rate = ntohl(opt_val);
+		opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+		if (option == TFRC_OPT_RECEIVE_RATE) {
+			/* Receive Rate is kept in units of 64 bytes/second */
+			hc->tx_x_recv = opt_val;
+			hc->tx_x_recv <<= 6;
+
 			ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
-				       dccp_role(sk), sk,
-				       opt_recv->ccid3or_receive_rate);
+				       dccp_role(sk), sk, opt_val);
+		} else {
+			/* Update the fixpoint Loss Event Rate fraction */
+			hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+			ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+				       dccp_role(sk), sk, opt_val);
 		}
-		break;
 	}
-
-	return rc;
+	return 0;
 }
 
 static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -559,42 +511,36 @@ static void ccid3_hc_tx_exit(struct sock *sk)
 {
 	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
 
-	ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
 	sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-
 	tfrc_tx_hist_purge(&hc->tx_hist);
 }
 
 static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
 {
-	struct ccid3_hc_tx_sock *hc;
-
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return;
-
-	hc = ccid3_hc_tx_sk(sk);
-	info->tcpi_rto = hc->tx_t_rto;
-	info->tcpi_rtt = hc->tx_rtt;
+	info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+	info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
 }
 
 static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
-	const struct ccid3_hc_tx_sock *hc;
+	const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+	struct tfrc_tx_info tfrc;
 	const void *val;
 
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return -EINVAL;
-
-	hc = ccid3_hc_tx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_TX_INFO:
-		if (len < sizeof(hc->tx_tfrc))
+		if (len < sizeof(tfrc))
 			return -EINVAL;
-		len = sizeof(hc->tx_tfrc);
-		val = &hc->tx_tfrc;
+		tfrc.tfrctx_x	   = hc->tx_x;
+		tfrc.tfrctx_x_recv = hc->tx_x_recv;
+		tfrc.tfrctx_x_calc = hc->tx_x_calc;
+		tfrc.tfrctx_rtt	   = hc->tx_rtt;
+		tfrc.tfrctx_p	   = hc->tx_p;
+		tfrc.tfrctx_rto	   = hc->tx_t_rto;
+		tfrc.tfrctx_ipi	   = hc->tx_t_ipi;
+		len = sizeof(tfrc);
+		val = &tfrc;
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -624,7 +570,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
 	static const char *const ccid3_rx_state_names[] = {
 	[TFRC_RSTATE_NO_DATA] = "NO_DATA",
 	[TFRC_RSTATE_DATA]    = "DATA",
-	[TFRC_RSTATE_TERM]    = "TERM",
 	};
 
 	return ccid3_rx_state_names[state];
@@ -650,14 +595,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 {
 	struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	ktime_t now;
+	ktime_t now = ktime_get_real();
 	s64 delta = 0;
 
-	if (unlikely(hc->rx_state == TFRC_RSTATE_TERM))
-		return;
-
-	now = ktime_get_real();
-
 	switch (fbtype) {
 	case CCID3_FBACK_INITIAL:
 		hc->rx_x_recv = 0;
@@ -701,23 +641,21 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk,
 
 static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 {
-	const struct ccid3_hc_rx_sock *hc;
+	const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
 	__be32 x_recv, pinv;
 
 	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
 		return 0;
 
-	hc = ccid3_hc_rx_sk(sk);
-
 	if (dccp_packet_without_ack(skb))
 		return 0;
 
 	x_recv = htonl(hc->rx_x_recv);
 	pinv   = htonl(hc->rx_pinv);
 
-	if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
+	if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
 			       &pinv, sizeof(pinv)) ||
-	    dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
+	    dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
 			       &x_recv, sizeof(x_recv)))
 		return -1;
 
@@ -749,10 +687,11 @@ static u32 ccid3_first_li(struct sock *sk)
 	x_recv = scaled_div32(hc->rx_bytes_recv, delta);
 	if (x_recv == 0) {		/* would also trigger divide-by-zero */
 		DCCP_WARN("X_recv==0\n");
-		if ((x_recv = hc->rx_x_recv) == 0) {
+		if (hc->rx_x_recv == 0) {
 			DCCP_BUG("stored value of X_recv is zero");
 			return ~0U;
 		}
+		x_recv = hc->rx_x_recv;
 	}
 
 	fval = scaled_div(hc->rx_s, hc->rx_rtt);
@@ -862,46 +801,31 @@ static void ccid3_hc_rx_exit(struct sock *sk)
 {
 	struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
 
-	ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
 	tfrc_rx_hist_purge(&hc->rx_hist);
 	tfrc_lh_cleanup(&hc->rx_li_hist);
 }
 
 static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
 {
-	const struct ccid3_hc_rx_sock *hc;
-
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return;
-
-	hc = ccid3_hc_rx_sk(sk);
-	info->tcpi_ca_state = hc->rx_state;
+	info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
 	info->tcpi_options  |= TCPI_OPT_TIMESTAMPS;
-	info->tcpi_rcv_rtt  = hc->rx_rtt;
+	info->tcpi_rcv_rtt  = ccid3_hc_rx_sk(sk)->rx_rtt;
 }
 
 static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
 				  u32 __user *optval, int __user *optlen)
 {
-	const struct ccid3_hc_rx_sock *hc;
+	const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
 	struct tfrc_rx_info rx_info;
 	const void *val;
 
-	/* Listen socks doesn't have a private CCID block */
-	if (sk->sk_state == DCCP_LISTEN)
-		return -EINVAL;
-
-	hc = ccid3_hc_rx_sk(sk);
 	switch (optname) {
 	case DCCP_SOCKOPT_CCID_RX_INFO:
 		if (len < sizeof(rx_info))
 			return -EINVAL;
 		rx_info.tfrcrx_x_recv = hc->rx_x_recv;
 		rx_info.tfrcrx_rtt    = hc->rx_rtt;
-		rx_info.tfrcrx_p      = hc->rx_pinv == 0 ? ~0U :
-					   scaled_div(1, hc->rx_pinv);
+		rx_info.tfrcrx_p      = tfrc_invert_loss_event_rate(hc->rx_pinv);
 		len = sizeof(rx_info);
 		val = &rx_info;
 		break;
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 03263577665..1a9933c2967 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -42,35 +42,36 @@
 #include "lib/tfrc.h"
 #include "../ccid.h"
 
-/* Two seconds as per RFC 3448 4.2 */
+/* Two seconds as per RFC 5348, 4.2 */
 #define TFRC_INITIAL_TIMEOUT	   (2 * USEC_PER_SEC)
 
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN  (USEC_PER_SEC / (2 * HZ))
-
 /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
 #define TFRC_T_MBI		   64
 
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta  =  t_gran / 2  =  %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA		   USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA		   (USEC_PER_SEC / (2 * HZ))
+#endif
+
 enum ccid3_options {
 	TFRC_OPT_LOSS_EVENT_RATE = 192,
 	TFRC_OPT_LOSS_INTERVALS	 = 193,
 	TFRC_OPT_RECEIVE_RATE	 = 194,
 };
 
-struct ccid3_options_received {
-	u64 ccid3or_seqno:48,
-	    ccid3or_loss_intervals_idx:16;
-	u16 ccid3or_loss_intervals_len;
-	u32 ccid3or_loss_event_rate;
-	u32 ccid3or_receive_rate;
-};
-
 /* TFRC sender states */
 enum ccid3_hc_tx_states {
 	TFRC_SSTATE_NO_SENT = 1,
 	TFRC_SSTATE_NO_FBACK,
 	TFRC_SSTATE_FBACK,
-	TFRC_SSTATE_TERM,
 };
 
 /**
@@ -90,19 +91,16 @@ enum ccid3_hc_tx_states {
  * @tx_no_feedback_timer: Handle to no feedback timer
  * @tx_t_ld:		  Time last doubled during slow start
  * @tx_t_nom:		  Nominal send time of next packet
- * @tx_delta:		  Send timer delta (RFC 3448, 4.6) in usecs
  * @tx_hist:		  Packet history
- * @tx_options_received:  Parsed set of retrieved options
  */
 struct ccid3_hc_tx_sock {
-	struct tfrc_tx_info		tx_tfrc;
-#define tx_x				tx_tfrc.tfrctx_x
-#define tx_x_recv			tx_tfrc.tfrctx_x_recv
-#define tx_x_calc			tx_tfrc.tfrctx_x_calc
-#define tx_rtt				tx_tfrc.tfrctx_rtt
-#define tx_p				tx_tfrc.tfrctx_p
-#define tx_t_rto			tx_tfrc.tfrctx_rto
-#define tx_t_ipi			tx_tfrc.tfrctx_ipi
+	u64				tx_x;
+	u64				tx_x_recv;
+	u32				tx_x_calc;
+	u32				tx_rtt;
+	u32				tx_p;
+	u32				tx_t_rto;
+	u32				tx_t_ipi;
 	u16				tx_s;
 	enum ccid3_hc_tx_states		tx_state:8;
 	u8				tx_last_win_count;
@@ -110,9 +108,7 @@ struct ccid3_hc_tx_sock {
 	struct timer_list		tx_no_feedback_timer;
 	ktime_t				tx_t_ld;
 	ktime_t				tx_t_nom;
-	u32				tx_delta;
 	struct tfrc_tx_hist_entry	*tx_hist;
-	struct ccid3_options_received	tx_options_received;
 };
 
 static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
@@ -126,21 +122,16 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
 enum ccid3_hc_rx_states {
 	TFRC_RSTATE_NO_DATA = 1,
 	TFRC_RSTATE_DATA,
-	TFRC_RSTATE_TERM    = 127,
 };
 
 /**
  * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- * @rx_x_recv:		     Receiver estimate of send rate (RFC 3448 4.3)
- * @rx_rtt:		     Receiver estimate of rtt (non-standard)
- * @rx_p:		     Current loss event rate (RFC 3448 5.4)
  * @rx_last_counter:	     Tracks window counter (RFC 4342, 8.1)
  * @rx_state:		     Receiver state, one of %ccid3_hc_rx_states
  * @rx_bytes_recv:	     Total sum of DCCP payload bytes
  * @rx_x_recv:		     Receiver estimate of send rate (RFC 3448, sec. 4.3)
  * @rx_rtt:		     Receiver estimate of RTT
  * @rx_tstamp_last_feedback: Time at which last feedback was sent
- * @rx_tstamp_last_ack:	     Time at which last feedback was sent
  * @rx_hist:		     Packet history (loss detection + RTT sampling)
  * @rx_li_hist:		     Loss Interval database
  * @rx_s:		     Received packet size in bytes
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 8fc3cbf7907..497723c4d4b 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
 	cur->li_length = len;
 	tfrc_lh_calc_i_mean(lh);
 
-	return (lh->i_mean < old_i_mean);
+	return lh->i_mean < old_i_mean;
 }
 
 /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 3a4f414e94a..de8fe294bf0 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -38,18 +38,6 @@
 #include "packet_history.h"
 #include "../../dccp.h"
 
-/**
- *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
- *  @next:  next oldest entry (LIFO order)
- *  @seqno: sequence number of this entry
- *  @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
-	struct tfrc_tx_hist_entry *next;
-	u64			  seqno;
-	ktime_t			  stamp;
-};
-
 /*
  * Transmitter History Routines
  */
@@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void)
 	}
 }
 
-static struct tfrc_tx_hist_entry *
-	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
-	while (head != NULL && head->seqno != seqno)
-		head = head->next;
-
-	return head;
-}
-
 int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
 {
 	struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
@@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
 	*headp = NULL;
 }
 
-u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno,
-		     const ktime_t now)
-{
-	u32 rtt = 0;
-	struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno);
-
-	if (packet != NULL) {
-		rtt = ktime_us_delta(now, packet->stamp);
-		/*
-		 * Garbage-collect older (irrelevant) entries:
-		 */
-		tfrc_tx_hist_purge(&packet->next);
-	}
-
-	return rtt;
-}
-
-
 /*
  *	Receiver History Routines
  */
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 7df6c529999..7ee4a9d9d33 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -40,12 +40,28 @@
 #include <linux/slab.h>
 #include "tfrc.h"
 
-struct tfrc_tx_hist_entry;
+/**
+ *  tfrc_tx_hist_entry  -  Simple singly-linked TX history list
+ *  @next:  next oldest entry (LIFO order)
+ *  @seqno: sequence number of this entry
+ *  @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+	struct tfrc_tx_hist_entry *next;
+	u64			  seqno;
+	ktime_t			  stamp;
+};
+
+static inline struct tfrc_tx_hist_entry *
+	tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
+{
+	while (head != NULL && head->seqno != seqno)
+		head = head->next;
+	return head;
+}
 
 extern int  tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
 extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-extern u32  tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head,
-			     const u64 seqno, const ktime_t now);
 
 /* Subtraction a-b modulo-16, respects circular wrap-around */
 #define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 01bb48e96c2..f8ee3f54977 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -57,6 +57,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
 
 extern u32  tfrc_calc_x(u16 s, u32 R, u32 p);
 extern u32  tfrc_calc_x_reverse_lookup(u32 fvalue);
+extern u32  tfrc_invert_loss_event_rate(u32 loss_event_rate);
 
 extern int  tfrc_tx_packet_history_init(void);
 extern void tfrc_tx_packet_history_exit(void);
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 22ca1cf0eb5..a052a4377e2 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -687,3 +687,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 	index = tfrc_binsearch(fvalue, 0);
 	return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
 }
+
+/**
+ * tfrc_invert_loss_event_rate  -  Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+	if (loss_event_rate == UINT_MAX)		/* see RFC 4342, 8.5 */
+		return 0;
+	if (unlikely(loss_event_rate == 0))		/* map 1/0 into 100% */
+		return 1000000;
+	return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index a10a61a1ded..a8ed459508b 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -153,18 +153,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
 }
 
 /**
- * dccp_loss_free  -  Evaluates condition for data loss from RFC 4340, 7.7.1
- * @s1:	 start sequence number
- * @s2:  end sequence number
+ * dccp_loss_count - Approximate the number of lost data packets in a burst loss
+ * @s1:  last known sequence number before the loss ('hole')
+ * @s2:  first sequence number seen after the 'hole'
  * @ndp: NDP count on packet with sequence number @s2
- * Returns true if the sequence range s1...s2 has no data loss.
  */
-static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
 {
 	s64 delta = dccp_delta_seqno(s1, s2);
 
 	WARN_ON(delta < 0);
-	return (u64)delta <= ndp + 1;
+	delta -= ndp + 1;
+
+	return delta > 0 ? delta : 0;
+}
+
+/**
+ * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+	return dccp_loss_count(s1, s2, ndp) == 0;
 }
 
 enum {
@@ -234,8 +243,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
 			   const enum dccp_pkt_type pkt_type);
 
-extern void dccp_write_xmit(struct sock *sk, int block);
-extern void dccp_write_space(struct sock *sk);
+extern void   dccp_write_xmit(struct sock *sk);
+extern void   dccp_write_space(struct sock *sk);
+extern void   dccp_flush_write_queue(struct sock *sk, long *time_budget);
 
 extern void dccp_init_xmit_timers(struct sock *sk);
 static inline void dccp_clear_xmit_timers(struct sock *sk)
@@ -246,7 +256,6 @@ static inline void dccp_clear_xmit_timers(struct sock *sk)
 extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
 
 extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
 
 extern void dccp_set_state(struct sock *sk, const int state);
 extern void dccp_done(struct sock *sk);
@@ -415,6 +424,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
 	dp->dccps_gsr = seq;
 	/* Sequence validity window depends on remote Sequence Window (7.5.1) */
 	dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+	/*
+	 * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+	 * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+	 * always > 32, so for the first W/W' packets in the lifetime of a
+	 * connection we always have to adjust SWL.
+	 * A second reason why we are doing this is that the window depends on
+	 * the feature-remote value of Sequence Window: nothing stops the peer
+	 * from updating this value while we are busy adjusting SWL for the
+	 * first W packets (we would have to count from scratch again then).
+	 * Therefore it is safer to always make sure that the Sequence Window
+	 * is not artificially extended by a peer who grows SWL downwards by
+	 * continually updating the feature-remote Sequence-Window.
+	 * If sequence numbers wrap it is bad luck. But that will take a while
+	 * (48 bit), and this measure prevents Sequence-number attacks.
+	 */
+	if (before48(dp->dccps_swl, dp->dccps_isr))
+		dp->dccps_swl = dp->dccps_isr;
 	dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
 }
 
@@ -425,14 +451,16 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq)
 	dp->dccps_gss = seq;
 	/* Ack validity window depends on local Sequence Window value (7.5.1) */
 	dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+	/* Adjust AWL so that it is not below ISS - see comment above for SWL */
+	if (before48(dp->dccps_awl, dp->dccps_iss))
+		dp->dccps_awl = dp->dccps_iss;
 	dp->dccps_awh = dp->dccps_gss;
 }
 
 static inline int dccp_ack_pending(const struct sock *sk)
 {
 	const struct dccp_sock *dp = dccp_sk(sk);
-	return dp->dccps_timestamp_echo != 0 ||
-	       (dp->dccps_hc_rx_ackvec != NULL &&
+	return (dp->dccps_hc_rx_ackvec != NULL &&
 		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
 	       inet_csk_ack_scheduled(sk);
 }
@@ -446,16 +474,11 @@ extern void dccp_feat_list_purge(struct list_head *fn_list);
 
 extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
 extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*);
-extern int dccp_insert_option_elapsed_time(struct sock *sk,
-					    struct sk_buff *skb,
-					    u32 elapsed_time);
+extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed);
 extern u32 dccp_timestamp(void);
 extern void dccp_timestamping_init(void);
-extern int dccp_insert_option_timestamp(struct sock *sk,
-					 struct sk_buff *skb);
-extern int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
-			       unsigned char option,
-			       const void *value, unsigned char len);
+extern int dccp_insert_option(struct sk_buff *skb, unsigned char option,
+			      const void *value, unsigned char len);
 
 #ifdef CONFIG_SYSCTL
 extern int dccp_sysctl_init(void);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index df7dd26cf07..568def95272 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 				  0, list, len);
 }
 
-/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
-int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
-{
-	/* any changes must be registered before establishing the connection */
-	if (sk->sk_state != DCCP_CLOSED)
-		return -EISCONN;
-	if (dccp_feat_type(feat) != FEAT_NN)
-		return -EINVAL;
-	return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
-}
 
 /*
  *	Tracking features whose value depend on the choice of CCID
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index f96721619de..e56a4e5e634 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -111,7 +111,6 @@ extern int  dccp_feat_init(struct sock *sk);
 extern void dccp_feat_initialise_sysctls(void);
 extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 				  u8 const *list, u8 len);
-extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
 				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 6beb6a7d6fb..265985370fa 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -259,7 +259,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
 				      sysctl_dccp_sync_ratelimit)))
 			return 0;
 
-		DCCP_WARN("DCCP: Step 6 failed for %s packet, "
+		DCCP_WARN("Step 6 failed for %s packet, "
 			  "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
 			  "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
 			  "sending SYNC...\n",  dccp_packet_name(dh->dccph_type),
@@ -430,7 +430,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		if (dccp_parse_options(sk, NULL, skb))
 			return 1;
 
-		/* Obtain usec RTT sample from SYN exchange (used by CCID 3) */
+		/* Obtain usec RTT sample from SYN exchange (used by TFRC). */
 		if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
 			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
 			    dp->dccps_options_received.dccpor_timestamp_echo));
@@ -441,20 +441,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 
-		dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
-		dccp_update_gsr(sk, dp->dccps_isr);
 		/*
-		 * SWL and AWL are initially adjusted so that they are not less than
-		 * the initial Sequence Numbers received and sent, respectively:
-		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
-		 *	AWL := max(GSS - W' + 1, ISS).
-		 * These adjustments MUST be applied only at the beginning of the
-		 * connection.
-		 *
-		 * AWL was adjusted in dccp_v4_connect -acme
+		 * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+		 * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+		 * is done as part of activating the feature values below, since
+		 * these settings depend on the local/remote Sequence Window
+		 * features, which were undefined or not confirmed until now.
 		 */
-		dccp_set_seqno(&dp->dccps_swl,
-			       max48(dp->dccps_swl, dp->dccps_isr));
+		dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
 
 		dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 
@@ -535,6 +529,8 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
 						   const struct dccp_hdr *dh,
 						   const unsigned len)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
+	u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
 	int queued = 0;
 
 	switch (dh->dccph_type) {
@@ -559,7 +555,14 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
 		if (sk->sk_state == DCCP_PARTOPEN)
 			inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 
-		dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+		/* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+		if (likely(sample)) {
+			long delta = dccp_timestamp() - sample;
+
+			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
+		}
+
+		dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
 		dccp_set_state(sk, DCCP_OPEN);
 
 		if (dh->dccph_type == DCCP_PKT_DATAACK ||
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d9b11ef8694..3f69ea11482 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -105,7 +105,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		goto failure;
 
 	/* OK, now commit destination to socket.  */
-	sk_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->dst);
 
 	dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
 						    inet->inet_daddr,
@@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newsk = dccp_create_openreq_child(sk, req, skb);
 	if (newsk == NULL)
-		goto exit;
+		goto exit_nonewsk;
 
 	sk_setup_caps(newsk, dst);
 
@@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto exit;
+	}
 	__inet_hash_nolisten(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 exit_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	dst_release(dst);
 exit:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-	dst_release(dst);
 	return NULL;
 }
 
@@ -475,7 +479,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
 		return NULL;
 	}
 
-	return &rt->u.dst;
+	return &rt->dst;
 }
 
 static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 09169889959..dca711df9b6 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -248,7 +248,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff *skb;
 	struct ipv6_txoptions *opt = NULL;
-	struct in6_addr *final_p = NULL, final;
+	struct in6_addr *final_p, final;
 	struct flowi fl;
 	int err = -1;
 	struct dst_entry *dst;
@@ -265,13 +265,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 
 	opt = np->opt;
 
-	if (opt != NULL && opt->srcrt != NULL) {
-		const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
@@ -545,19 +539,13 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		goto out_overflow;
 
 	if (dst == NULL) {
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_DCCP;
 		ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
-		if (opt != NULL && opt->srcrt != NULL) {
-			const struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
-
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, opt, &final);
 		ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
@@ -576,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	newsk = dccp_create_openreq_child(sk, req, skb);
 	if (newsk == NULL)
-		goto out;
+		goto out_nonewsk;
 
 	/*
 	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -644,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto out;
+	}
 	__inet6_hash(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 out_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+	dst_release(dst);
 out:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	if (opt != NULL && opt != np->opt)
 		sock_kfree_s(sk, opt, opt->tot_len);
-	dst_release(dst);
 	return NULL;
 }
 
@@ -885,7 +877,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct in6_addr *saddr = NULL, *final_p = NULL, final;
+	struct in6_addr *saddr = NULL, *final_p, final;
 	struct flowi fl;
 	struct dst_entry *dst;
 	int addr_type;
@@ -988,13 +980,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl.fl_ip_sport = inet->inet_sport;
 	security_sk_classify_flow(sk, &fl);
 
-	if (np->opt != NULL && np->opt->srcrt != NULL) {
-		const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, np->opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 128b089d3ae..d7041a0963a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -121,30 +121,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		 *
 		 *    Choose S.ISS (initial seqno) or set from Init Cookies
 		 *    Initialize S.GAR := S.ISS
-		 *    Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
-		 */
-		newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss;
-		dccp_update_gss(newsk, dreq->dreq_iss);
-
-		newdp->dccps_isr = dreq->dreq_isr;
-		dccp_update_gsr(newsk, dreq->dreq_isr);
-
-		/*
-		 * SWL and AWL are initially adjusted so that they are not less than
-		 * the initial Sequence Numbers received and sent, respectively:
-		 *	SWL := max(GSR + 1 - floor(W/4), ISR),
-		 *	AWL := max(GSS - W' + 1, ISS).
-		 * These adjustments MUST be applied only at the beginning of the
-		 * connection.
+		 *    Set S.ISR, S.GSR from packet (or Init Cookies)
+		 *
+		 *    Setting AWL/AWH and SWL/SWH happens as part of the feature
+		 *    activation below, as these windows all depend on the local
+		 *    and remote Sequence Window feature values (7.5.2).
 		 */
-		dccp_set_seqno(&newdp->dccps_swl,
-			       max48(newdp->dccps_swl, newdp->dccps_isr));
-		dccp_set_seqno(&newdp->dccps_awl,
-			       max48(newdp->dccps_awl, newdp->dccps_iss));
+		newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss;
+		newdp->dccps_gar = newdp->dccps_iss;
+		newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr;
 
 		/*
-		 * Activate features after initialising the sequence numbers,
-		 * since CCID initialisation may depend on GSS, ISR, ISS etc.
+		 * Activate features: initialise CCIDs, sequence windows etc.
 		 */
 		if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
 			/* It is still raw copy of parent, so invalidate
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 07395f861d3..cd306181300 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		}
 
 		/*
-		 * CCID-Specific Options (from RFC 4340, sec. 10.3):
-		 *
-		 * Option numbers 128 through 191 are for options sent from the
-		 * HC-Sender to the HC-Receiver; option numbers 192 through 255
-		 * are for options sent from the HC-Receiver to	the HC-Sender.
-		 *
 		 * CCID-specific options are ignored during connection setup, as
 		 * negotiation may still be in progress (see RFC 4340, 10.3).
 		 * The same applies to Ack Vectors, as these depend on the CCID.
-		 *
 		 */
-		if (dreq != NULL && (opt >= 128 ||
+		if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
 		    opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
 			goto ignore_option;
 
@@ -170,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 				      dccp_role(sk), ntohl(opt_val),
 				      (unsigned long long)
 				      DCCP_SKB_CB(skb)->dccpd_ack_seq);
+			/* schedule an Ack in case this sender is quiescent */
+			inet_csk_schedule_ack(sk);
 			break;
 		case DCCPO_TIMESTAMP_ECHO:
 			if (len != 4 && len != 6 && len != 8)
@@ -226,23 +221,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 			dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
 				      dccp_role(sk), elapsed_time);
 			break;
-		case 128 ... 191: {
-			const u16 idx = value - options;
-
+		case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
 			if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
-						     opt, len, idx,
-						     value) != 0)
+						     pkt_type, opt, value, len))
 				goto out_invalid_option;
-		}
 			break;
-		case 192 ... 255: {
-			const u16 idx = value - options;
-
+		case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
 			if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
-						     opt, len, idx,
-						     value) != 0)
+						     pkt_type, opt, value, len))
 				goto out_invalid_option;
-		}
 			break;
 		default:
 			DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
@@ -299,9 +286,8 @@ static inline u8 dccp_ndp_len(const u64 ndp)
 	return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
 }
 
-int dccp_insert_option(struct sock *sk, struct sk_buff *skb,
-			const unsigned char option,
-			const void *value, const unsigned char len)
+int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
+		       const void *value, const unsigned char len)
 {
 	unsigned char *to;
 
@@ -354,8 +340,7 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time)
 	return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
 }
 
-int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb,
-				    u32 elapsed_time)
+int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time)
 {
 	const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
 	const int len = 2 + elapsed_time_len;
@@ -386,17 +371,15 @@ int dccp_insert_option_elapsed_time(struct sock *sk, struct sk_buff *skb,
 
 EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
 
-int dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
 {
 	__be32 now = htonl(dccp_timestamp());
 	/* yes this will overflow but that is the point as we want a
 	 * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
 
-	return dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+	return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
 }
 
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
 static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
 					     struct dccp_request_sock *dreq,
 					     struct sk_buff *skb)
@@ -531,9 +514,9 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 		if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
 			/*
 			 * Obtain RTT sample from Request/Response exchange.
-			 * This is currently used in CCID 3 initialisation.
+			 * This is currently used for TFRC initialisation.
 			 */
-			if (dccp_insert_option_timestamp(sk, skb))
+			if (dccp_insert_option_timestamp(skb))
 				return -1;
 
 		} else if (dp->dccps_hc_rx_ackvec != NULL &&
@@ -564,6 +547,10 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
 	if (dccp_feat_insert_opts(NULL, dreq, skb))
 		return -1;
 
+	/* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
+	if (dccp_insert_option_timestamp(skb))
+		return -1;
+
 	if (dreq->dreq_timestamp_echo != 0 &&
 	    dccp_insert_option_timestamp_echo(NULL, dreq, skb))
 		return -1;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index aadbdb58758..45b91853f5a 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -209,108 +209,150 @@ void dccp_write_space(struct sock *sk)
 }
 
 /**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
+ * dccp_wait_for_ccid  -  Await CCID send permission
  * @sk:    socket to wait for
- * @skb:   current skb to pass on for waiting
- * @delay: sleep timeout in milliseconds (> 0)
- * This function is called by default when the socket is closed, and
- * when a non-zero linger time is set on the socket. For consistency
+ * @delay: timeout in jiffies
+ * This is used by CCIDs which need to delay the send time in process context.
  */
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay)
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
 	DEFINE_WAIT(wait);
-	unsigned long jiffdelay;
-	int rc;
+	long remaining;
+
+	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+	sk->sk_write_pending++;
+	release_sock(sk);
+
+	remaining = schedule_timeout(delay);
+
+	lock_sock(sk);
+	sk->sk_write_pending--;
+	finish_wait(sk_sleep(sk), &wait);
+
+	if (signal_pending(current) || sk->sk_err)
+		return -1;
+	return remaining;
+}
+
+/**
+ * dccp_xmit_packet  -  Send data packet under control of CCID
+ * Transmits next-queued payload and informs CCID to account for the packet.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+	int err, len;
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue);
+
+	if (unlikely(skb == NULL))
+		return;
+	len = skb->len;
 
-	do {
-		dccp_pr_debug("delayed send by %d msec\n", delay);
-		jiffdelay = msecs_to_jiffies(delay);
+	if (sk->sk_state == DCCP_PARTOPEN) {
+		const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+		/*
+		 * See 8.1.5 - Handshake Completion.
+		 *
+		 * For robustness we resend Confirm options until the client has
+		 * entered OPEN. During the initial feature negotiation, the MPS
+		 * is smaller than usual, reduced by the Change/Confirm options.
+		 */
+		if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+			DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+			dccp_send_ack(sk);
+			dccp_feat_list_purge(&dp->dccps_featneg);
+		}
 
-		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+		inet_csk_schedule_ack(sk);
+		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+					      inet_csk(sk)->icsk_rto,
+					      DCCP_RTO_MAX);
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+	} else if (dccp_ack_pending(sk)) {
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+	} else {
+		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
+	}
 
-		sk->sk_write_pending++;
-		release_sock(sk);
-		schedule_timeout(jiffdelay);
-		lock_sock(sk);
-		sk->sk_write_pending--;
+	err = dccp_transmit_skb(sk, skb);
+	if (err)
+		dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+	/*
+	 * Register this one as sent even if an error occurred. To the remote
+	 * end a local packet drop is indistinguishable from network loss, i.e.
+	 * any local drop will eventually be reported via receiver feedback.
+	 */
+	ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+}
 
-		if (sk->sk_err)
-			goto do_error;
-		if (signal_pending(current))
-			goto do_interrupted;
+/**
+ * dccp_flush_write_queue  -  Drain queue at end of connection
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	long delay, rc;
 
+	while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
 		rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-	} while ((delay = rc) > 0);
-out:
-	finish_wait(sk_sleep(sk), &wait);
-	return rc;
-
-do_error:
-	rc = -EPIPE;
-	goto out;
-do_interrupted:
-	rc = -EINTR;
-	goto out;
+
+		switch (ccid_packet_dequeue_eval(rc)) {
+		case CCID_PACKET_WILL_DEQUEUE_LATER:
+			/*
+			 * If the CCID determines when to send, the next sending
+			 * time is unknown or the CCID may not even send again
+			 * (e.g. remote host crashes or lost Ack packets).
+			 */
+			DCCP_WARN("CCID did not manage to send all packets\n");
+			return;
+		case CCID_PACKET_DELAY:
+			delay = msecs_to_jiffies(rc);
+			if (delay > *time_budget)
+				return;
+			rc = dccp_wait_for_ccid(sk, delay);
+			if (rc < 0)
+				return;
+			*time_budget -= (delay - rc);
+			/* check again if we can send now */
+			break;
+		case CCID_PACKET_SEND_AT_ONCE:
+			dccp_xmit_packet(sk);
+			break;
+		case CCID_PACKET_ERR:
+			skb_dequeue(&sk->sk_write_queue);
+			kfree_skb(skb);
+			dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+		}
+	}
 }
 
-void dccp_write_xmit(struct sock *sk, int block)
+void dccp_write_xmit(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
 
 	while ((skb = skb_peek(&sk->sk_write_queue))) {
-		int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
-		if (err > 0) {
-			if (!block) {
-				sk_reset_timer(sk, &dp->dccps_xmit_timer,
-						msecs_to_jiffies(err)+jiffies);
-				break;
-			} else
-				err = dccp_wait_for_ccid(sk, skb, err);
-			if (err && err != -EINTR)
-				DCCP_BUG("err=%d after dccp_wait_for_ccid", err);
-		}
+		int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
 
-		skb_dequeue(&sk->sk_write_queue);
-		if (err == 0) {
-			struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-			const int len = skb->len;
-
-			if (sk->sk_state == DCCP_PARTOPEN) {
-				const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
-				/*
-				 * See 8.1.5 - Handshake Completion.
-				 *
-				 * For robustness we resend Confirm options until the client has
-				 * entered OPEN. During the initial feature negotiation, the MPS
-				 * is smaller than usual, reduced by the Change/Confirm options.
-				 */
-				if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
-					DCCP_WARN("Payload too large (%d) for featneg.\n", len);
-					dccp_send_ack(sk);
-					dccp_feat_list_purge(&dp->dccps_featneg);
-				}
-
-				inet_csk_schedule_ack(sk);
-				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-						  inet_csk(sk)->icsk_rto,
-						  DCCP_RTO_MAX);
-				dcb->dccpd_type = DCCP_PKT_DATAACK;
-			} else if (dccp_ack_pending(sk))
-				dcb->dccpd_type = DCCP_PKT_DATAACK;
-			else
-				dcb->dccpd_type = DCCP_PKT_DATA;
-
-			err = dccp_transmit_skb(sk, skb);
-			ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
-			if (err)
-				DCCP_BUG("err=%d after ccid_hc_tx_packet_sent",
-					 err);
-		} else {
-			dccp_pr_debug("packet discarded due to err=%d\n", err);
+		switch (ccid_packet_dequeue_eval(rc)) {
+		case CCID_PACKET_WILL_DEQUEUE_LATER:
+			return;
+		case CCID_PACKET_DELAY:
+			sk_reset_timer(sk, &dp->dccps_xmit_timer,
+				       jiffies + msecs_to_jiffies(rc));
+			return;
+		case CCID_PACKET_SEND_AT_ONCE:
+			dccp_xmit_packet(sk);
+			break;
+		case CCID_PACKET_ERR:
+			skb_dequeue(&sk->sk_write_queue);
 			kfree_skb(skb);
+			dccp_pr_debug("packet discarded due to err=%d\n", rc);
 		}
 	}
 }
@@ -474,8 +516,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
 /*
  * Do all connect socket setups that can be done AF independent.
  */
-static inline void dccp_connect_init(struct sock *sk)
+int dccp_connect(struct sock *sk)
 {
+	struct sk_buff *skb;
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -485,22 +528,12 @@ static inline void dccp_connect_init(struct sock *sk)
 
 	dccp_sync_mss(sk, dst_mtu(dst));
 
-	/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
-	dp->dccps_gar = dp->dccps_iss;
-
-	icsk->icsk_retransmits = 0;
-}
-
-int dccp_connect(struct sock *sk)
-{
-	struct sk_buff *skb;
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
 	/* do not connect if feature negotiation setup fails */
 	if (dccp_feat_finalise_settings(dccp_sk(sk)))
 		return -EPROTO;
 
-	dccp_connect_init(sk);
+	/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+	dp->dccps_gar = dp->dccps_iss;
 
 	skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
 	if (unlikely(skb == NULL))
@@ -516,6 +549,7 @@ int dccp_connect(struct sock *sk)
 	DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the REQUEST until an answer. */
+	icsk->icsk_retransmits = 0;
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 				  icsk->icsk_rto, DCCP_RTO_MAX);
 	return 0;
@@ -630,7 +664,6 @@ void dccp_send_close(struct sock *sk, const int active)
 		DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
 
 	if (active) {
-		dccp_write_xmit(sk, 1);
 		dccp_skb_entail(sk, skb);
 		dccp_transmit_skb(sk, skb_clone(skb, prio));
 		/*
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
index 078e48d442f..33d0e6297c2 100644
--- a/net/dccp/probe.c
+++ b/net/dccp/probe.c
@@ -149,6 +149,7 @@ static const struct file_operations dccpprobe_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = dccpprobe_open,
 	.read    = dccpprobe_read,
+	.llseek  = noop_llseek,
 };
 
 static __init int dccpprobe_init(void)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index b03ecf6b2bb..ef343d53fce 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
 /* the maximum queue length for tx in packets. 0 is no limit */
 int sysctl_dccp_tx_qlen __read_mostly = 5;
 
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state)
+{
+	static const char *const dccp_state_names[] = {
+	[DCCP_OPEN]		= "OPEN",
+	[DCCP_REQUESTING]	= "REQUESTING",
+	[DCCP_PARTOPEN]		= "PARTOPEN",
+	[DCCP_LISTEN]		= "LISTEN",
+	[DCCP_RESPOND]		= "RESPOND",
+	[DCCP_CLOSING]		= "CLOSING",
+	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
+	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
+	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
+	[DCCP_TIME_WAIT]	= "TIME_WAIT",
+	[DCCP_CLOSED]		= "CLOSED",
+	};
+
+	if (state >= DCCP_MAX_STATES)
+		return "INVALID STATE!";
+	else
+		return dccp_state_names[state];
+}
+#endif
+
 void dccp_set_state(struct sock *sk, const int state)
 {
 	const int oldstate = sk->sk_state;
@@ -146,30 +170,6 @@ const char *dccp_packet_name(const int type)
 
 EXPORT_SYMBOL_GPL(dccp_packet_name);
 
-const char *dccp_state_name(const int state)
-{
-	static const char *const dccp_state_names[] = {
-	[DCCP_OPEN]		= "OPEN",
-	[DCCP_REQUESTING]	= "REQUESTING",
-	[DCCP_PARTOPEN]		= "PARTOPEN",
-	[DCCP_LISTEN]		= "LISTEN",
-	[DCCP_RESPOND]		= "RESPOND",
-	[DCCP_CLOSING]		= "CLOSING",
-	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
-	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
-	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
-	[DCCP_TIME_WAIT]	= "TIME_WAIT",
-	[DCCP_CLOSED]		= "CLOSED",
-	};
-
-	if (state >= DCCP_MAX_STATES)
-		return "INVALID STATE!";
-	else
-		return dccp_state_names[state];
-}
-
-EXPORT_SYMBOL_GPL(dccp_state_name);
-
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -473,14 +473,9 @@ static int dccp_setsockopt_ccid(struct sock *sk, int type,
 	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
 		return -EINVAL;
 
-	val = kmalloc(optlen, GFP_KERNEL);
-	if (val == NULL)
-		return -ENOMEM;
-
-	if (copy_from_user(val, optval, optlen)) {
-		kfree(val);
-		return -EFAULT;
-	}
+	val = memdup_user(optval, optlen);
+	if (IS_ERR(val))
+		return PTR_ERR(val);
 
 	lock_sock(sk);
 	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
@@ -731,7 +726,13 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		goto out_discard;
 
 	skb_queue_tail(&sk->sk_write_queue, skb);
-	dccp_write_xmit(sk,0);
+	/*
+	 * The xmit_timer is set if the TX CCID is rate-based and will expire
+	 * when congestion control permits to release further packets into the
+	 * network. Window-based CCIDs do not use this timer.
+	 */
+	if (!timer_pending(&dp->dccps_xmit_timer))
+		dccp_write_xmit(sk);
 out_release:
 	release_sock(sk);
 	return rc ? : len;
@@ -949,16 +950,29 @@ void dccp_close(struct sock *sk, long timeout)
 
 	if (data_was_unread) {
 		/* Unread data was tossed, send an appropriate Reset Code */
-		DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
+		DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
 		dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
 		dccp_set_state(sk, DCCP_CLOSED);
 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
 		/* Check zero linger _after_ checking for unread data. */
 		sk->sk_prot->disconnect(sk, 0);
 	} else if (sk->sk_state != DCCP_CLOSED) {
+		/*
+		 * Normal connection termination. May need to wait if there are
+		 * still packets in the TX queue that are delayed by the CCID.
+		 */
+		dccp_flush_write_queue(sk, &timeout);
 		dccp_terminate_connection(sk);
 	}
 
+	/*
+	 * Flush write queue. This may be necessary in several cases:
+	 * - we have been closed by the peer but still have application data;
+	 * - abortive termination (unread data or zero linger time),
+	 * - normal termination but queue could not be flushed within time limit
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+
 	sk_stream_wait_close(sk, timeout);
 
 adjudge_to_death:
@@ -1007,7 +1021,8 @@ EXPORT_SYMBOL_GPL(dccp_shutdown);
 static inline int dccp_mib_init(void)
 {
 	return snmp_mib_init((void __percpu **)dccp_statistics,
-			     sizeof(struct dccp_mib));
+			     sizeof(struct dccp_mib),
+			     __alignof__(struct dccp_mib));
 }
 
 static inline void dccp_mib_exit(void)
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index 1a9aa05d4dc..7587870b704 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -237,32 +237,35 @@ out:
 	sock_put(sk);
 }
 
-/* Transmit-delay timer: used by the CCIDs to delay actual send time */
-static void dccp_write_xmit_timer(unsigned long data)
+/**
+ * dccp_write_xmitlet  -  Workhorse for CCID packet dequeueing interface
+ * See the comments above %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
 {
 	struct sock *sk = (struct sock *)data;
-	struct dccp_sock *dp = dccp_sk(sk);
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk))
-		sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
+		sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
 	else
-		dccp_write_xmit(sk, 0);
+		dccp_write_xmit(sk);
 	bh_unlock_sock(sk);
-	sock_put(sk);
 }
 
-static void dccp_init_write_xmit_timer(struct sock *sk)
+static void dccp_write_xmit_timer(unsigned long data)
 {
-	struct dccp_sock *dp = dccp_sk(sk);
-
-	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
-			(unsigned long)sk);
+	dccp_write_xmitlet(data);
+	sock_put((struct sock *)data);
 }
 
 void dccp_init_xmit_timers(struct sock *sk)
 {
-	dccp_init_write_xmit_timer(sk);
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+	setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+							     (unsigned long)sk);
 	inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
 				  &dccp_keepalive_timer);
 }
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 0363bb95cc7..a085dbcf5c7 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -48,7 +48,6 @@
 #include <net/dn_neigh.h>
 #include <net/dn_route.h>
 
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev);
 static int dn_neigh_construct(struct neighbour *);
 static void dn_long_error_report(struct neighbour *, struct sk_buff *);
 static void dn_short_error_report(struct neighbour *, struct sk_buff *);
@@ -93,6 +92,13 @@ static const struct neigh_ops dn_phase3_ops = {
 	.queue_xmit =		dev_queue_xmit
 };
 
+static u32 dn_neigh_hash(const void *pkey,
+			 const struct net_device *dev,
+			 __u32 hash_rnd)
+{
+	return jhash_2words(*(__u16 *)pkey, 0, hash_rnd);
+}
+
 struct neigh_table dn_neigh_table = {
 	.family =			PF_DECnet,
 	.entry_size =			sizeof(struct dn_neigh),
@@ -122,11 +128,6 @@ struct neigh_table dn_neigh_table = {
 	.gc_thresh3 =			1024,
 };
 
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
-{
-	return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd);
-}
-
 static int dn_neigh_construct(struct neighbour *neigh)
 {
 	struct net_device *dev = neigh->dev;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index baeb1eaf011..2ef115277be 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -693,22 +693,22 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
 	aux = scp->accessdata.acc_userl;
 	*skb_put(skb, 1) = aux;
 	if (aux > 0)
-	memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
+		memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux);
 
 	aux = scp->accessdata.acc_passl;
 	*skb_put(skb, 1) = aux;
 	if (aux > 0)
-	memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
+		memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux);
 
 	aux = scp->accessdata.acc_accl;
 	*skb_put(skb, 1) = aux;
 	if (aux > 0)
-	memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
+		memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux);
 
 	aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
 	*skb_put(skb, 1) = aux;
 	if (aux > 0)
-	memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux);
+		memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux);
 
 	scp->persist = dn_nsp_persist(sk);
 	scp->persist_fxn = dn_nsp_retrans_conninit;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 812e6dff606..df0f3e54ff8 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -132,7 +132,6 @@ static struct dst_ops dn_dst_ops = {
 	.negative_advice =	dn_dst_negative_advice,
 	.link_failure =		dn_dst_link_failure,
 	.update_pmtu =		dn_dst_update_pmtu,
-	.entries =		ATOMIC_INIT(0),
 };
 
 static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
@@ -146,13 +145,13 @@ static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
 
 static inline void dnrt_free(struct dn_route *rt)
 {
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static inline void dnrt_drop(struct dn_route *rt)
 {
-	dst_release(&rt->u.dst);
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+	dst_release(&rt->dst);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static void dn_dst_check_expire(unsigned long dummy)
@@ -167,13 +166,13 @@ static void dn_dst_check_expire(unsigned long dummy)
 
 		spin_lock(&dn_rt_hash_table[i].lock);
 		while((rt=*rtp) != NULL) {
-			if (atomic_read(&rt->u.dst.__refcnt) ||
-					(now - rt->u.dst.lastuse) < expire) {
-				rtp = &rt->u.dst.dn_next;
+			if (atomic_read(&rt->dst.__refcnt) ||
+					(now - rt->dst.lastuse) < expire) {
+				rtp = &rt->dst.dn_next;
 				continue;
 			}
-			*rtp = rt->u.dst.dn_next;
-			rt->u.dst.dn_next = NULL;
+			*rtp = rt->dst.dn_next;
+			rt->dst.dn_next = NULL;
 			dnrt_free(rt);
 		}
 		spin_unlock(&dn_rt_hash_table[i].lock);
@@ -198,13 +197,13 @@ static int dn_dst_gc(struct dst_ops *ops)
 		rtp = &dn_rt_hash_table[i].chain;
 
 		while((rt=*rtp) != NULL) {
-			if (atomic_read(&rt->u.dst.__refcnt) ||
-					(now - rt->u.dst.lastuse) < expire) {
-				rtp = &rt->u.dst.dn_next;
+			if (atomic_read(&rt->dst.__refcnt) ||
+					(now - rt->dst.lastuse) < expire) {
+				rtp = &rt->dst.dn_next;
 				continue;
 			}
-			*rtp = rt->u.dst.dn_next;
-			rt->u.dst.dn_next = NULL;
+			*rtp = rt->dst.dn_next;
+			rt->dst.dn_next = NULL;
 			dnrt_drop(rt);
 			break;
 		}
@@ -287,25 +286,25 @@ static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route *
 	while((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
 			/* Put it first */
-			*rthp = rth->u.dst.dn_next;
-			rcu_assign_pointer(rth->u.dst.dn_next,
+			*rthp = rth->dst.dn_next;
+			rcu_assign_pointer(rth->dst.dn_next,
 					   dn_rt_hash_table[hash].chain);
 			rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
 
-			dst_use(&rth->u.dst, now);
+			dst_use(&rth->dst, now);
 			spin_unlock_bh(&dn_rt_hash_table[hash].lock);
 
 			dnrt_drop(rt);
 			*rp = rth;
 			return 0;
 		}
-		rthp = &rth->u.dst.dn_next;
+		rthp = &rth->dst.dn_next;
 	}
 
-	rcu_assign_pointer(rt->u.dst.dn_next, dn_rt_hash_table[hash].chain);
+	rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain);
 	rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
 
-	dst_use(&rt->u.dst, now);
+	dst_use(&rt->dst, now);
 	spin_unlock_bh(&dn_rt_hash_table[hash].lock);
 	*rp = rt;
 	return 0;
@@ -323,8 +322,8 @@ static void dn_run_flush(unsigned long dummy)
 			goto nothing_to_declare;
 
 		for(; rt; rt=next) {
-			next = rt->u.dst.dn_next;
-			rt->u.dst.dn_next = NULL;
+			next = rt->dst.dn_next;
+			rt->dst.dn_next = NULL;
 			dst_free((struct dst_entry *)rt);
 		}
 
@@ -743,7 +742,7 @@ static int dn_forward(struct sk_buff *skb)
 	/* Ensure that we have enough space for headers */
 	rt = (struct dn_route *)skb_dst(skb);
 	header_len = dn_db->use_long ? 21 : 6;
-	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+header_len))
+	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len))
 		goto drop;
 
 	/*
@@ -752,7 +751,7 @@ static int dn_forward(struct sk_buff *skb)
 	if (++cb->hops > 30)
 		goto drop;
 
-	skb->dev = rt->u.dst.dev;
+	skb->dev = rt->dst.dev;
 
 	/*
 	 * If packet goes out same interface it came in on, then set
@@ -792,7 +791,7 @@ static int dn_rt_bug(struct sk_buff *skb)
 static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
 {
 	struct dn_fib_info *fi = res->fi;
-	struct net_device *dev = rt->u.dst.dev;
+	struct net_device *dev = rt->dst.dev;
 	struct neighbour *n;
 	unsigned mss;
 
@@ -800,25 +799,25 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
 		if (DN_FIB_RES_GW(*res) &&
 		    DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = DN_FIB_RES_GW(*res);
-		memcpy(rt->u.dst.metrics, fi->fib_metrics,
-		       sizeof(rt->u.dst.metrics));
+		memcpy(rt->dst.metrics, fi->fib_metrics,
+		       sizeof(rt->dst.metrics));
 	}
 	rt->rt_type = res->type;
 
-	if (dev != NULL && rt->u.dst.neighbour == NULL) {
+	if (dev != NULL && rt->dst.neighbour == NULL) {
 		n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
-		rt->u.dst.neighbour = n;
+		rt->dst.neighbour = n;
 	}
 
-	if (dst_metric(&rt->u.dst, RTAX_MTU) == 0 ||
-	    dst_metric(&rt->u.dst, RTAX_MTU) > rt->u.dst.dev->mtu)
-		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
-	mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst));
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0 ||
-	    dst_metric(&rt->u.dst, RTAX_ADVMSS) > mss)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = mss;
+	if (dst_metric(&rt->dst, RTAX_MTU) == 0 ||
+	    dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
+		rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
+	mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0 ||
+	    dst_metric(&rt->dst, RTAX_ADVMSS) > mss)
+		rt->dst.metrics[RTAX_ADVMSS-1] = mss;
 	return 0;
 }
 
@@ -1096,8 +1095,8 @@ make_route:
 	if (rt == NULL)
 		goto e_nobufs;
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
-	rt->u.dst.flags   = DST_HOST;
+	atomic_set(&rt->dst.__refcnt, 1);
+	rt->dst.flags   = DST_HOST;
 
 	rt->fl.fld_src    = oldflp->fld_src;
 	rt->fl.fld_dst    = oldflp->fld_dst;
@@ -1113,17 +1112,17 @@ make_route:
 	rt->rt_dst_map    = fl.fld_dst;
 	rt->rt_src_map    = fl.fld_src;
 
-	rt->u.dst.dev = dev_out;
+	rt->dst.dev = dev_out;
 	dev_hold(dev_out);
-	rt->u.dst.neighbour = neigh;
+	rt->dst.neighbour = neigh;
 	neigh = NULL;
 
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.output  = dn_output;
-	rt->u.dst.input   = dn_rt_bug;
+	rt->dst.lastuse = jiffies;
+	rt->dst.output  = dn_output;
+	rt->dst.input   = dn_rt_bug;
 	rt->rt_flags      = flags;
 	if (flags & RTCF_LOCAL)
-		rt->u.dst.input = dn_nsp_rx;
+		rt->dst.input = dn_nsp_rx;
 
 	err = dn_rt_set_next_hop(rt, &res);
 	if (err)
@@ -1152,7 +1151,7 @@ e_nobufs:
 	err = -ENOBUFS;
 	goto done;
 e_neighbour:
-	dst_free(&rt->u.dst);
+	dst_free(&rt->dst);
 	goto e_nobufs;
 }
 
@@ -1168,15 +1167,15 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowi *fl
 	if (!(flags & MSG_TRYHARD)) {
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
-			rt = rcu_dereference_bh(rt->u.dst.dn_next)) {
+			rt = rcu_dereference_bh(rt->dst.dn_next)) {
 			if ((flp->fld_dst == rt->fl.fld_dst) &&
 			    (flp->fld_src == rt->fl.fld_src) &&
 			    (flp->mark == rt->fl.mark) &&
 			    (rt->fl.iif == 0) &&
 			    (rt->fl.oif == flp->oif)) {
-				dst_use(&rt->u.dst, jiffies);
+				dst_use(&rt->dst, jiffies);
 				rcu_read_unlock_bh();
-				*pprt = &rt->u.dst;
+				*pprt = &rt->dst;
 				return 0;
 			}
 		}
@@ -1375,29 +1374,29 @@ make_route:
 	rt->fl.iif        = in_dev->ifindex;
 	rt->fl.mark       = fl.mark;
 
-	rt->u.dst.flags = DST_HOST;
-	rt->u.dst.neighbour = neigh;
-	rt->u.dst.dev = out_dev;
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.output = dn_rt_bug;
+	rt->dst.flags = DST_HOST;
+	rt->dst.neighbour = neigh;
+	rt->dst.dev = out_dev;
+	rt->dst.lastuse = jiffies;
+	rt->dst.output = dn_rt_bug;
 	switch(res.type) {
 		case RTN_UNICAST:
-			rt->u.dst.input = dn_forward;
+			rt->dst.input = dn_forward;
 			break;
 		case RTN_LOCAL:
-			rt->u.dst.output = dn_output;
-			rt->u.dst.input = dn_nsp_rx;
-			rt->u.dst.dev = in_dev;
+			rt->dst.output = dn_output;
+			rt->dst.input = dn_nsp_rx;
+			rt->dst.dev = in_dev;
 			flags |= RTCF_LOCAL;
 			break;
 		default:
 		case RTN_UNREACHABLE:
 		case RTN_BLACKHOLE:
-			rt->u.dst.input = dst_discard;
+			rt->dst.input = dst_discard;
 	}
 	rt->rt_flags = flags;
-	if (rt->u.dst.dev)
-		dev_hold(rt->u.dst.dev);
+	if (rt->dst.dev)
+		dev_hold(rt->dst.dev);
 
 	err = dn_rt_set_next_hop(rt, &res);
 	if (err)
@@ -1405,7 +1404,7 @@ make_route:
 
 	hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst);
 	dn_insert_route(rt, hash, &rt);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 done:
 	if (neigh)
@@ -1427,7 +1426,7 @@ e_nobufs:
 	goto done;
 
 e_neighbour:
-	dst_free(&rt->u.dst);
+	dst_free(&rt->dst);
 	goto done;
 }
 
@@ -1442,13 +1441,13 @@ static int dn_route_input(struct sk_buff *skb)
 
 	rcu_read_lock();
 	for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
-	    rt = rcu_dereference(rt->u.dst.dn_next)) {
+	    rt = rcu_dereference(rt->dst.dn_next)) {
 		if ((rt->fl.fld_src == cb->src) &&
 		    (rt->fl.fld_dst == cb->dst) &&
 		    (rt->fl.oif == 0) &&
 		    (rt->fl.mark == skb->mark) &&
 		    (rt->fl.iif == cb->iif)) {
-			dst_use(&rt->u.dst, jiffies);
+			dst_use(&rt->dst, jiffies);
 			rcu_read_unlock();
 			skb_dst_set(skb, (struct dst_entry *)rt);
 			return 0;
@@ -1487,8 +1486,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 		r->rtm_src_len = 16;
 		RTA_PUT(skb, RTA_SRC, 2, &rt->fl.fld_src);
 	}
-	if (rt->u.dst.dev)
-		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+	if (rt->dst.dev)
+		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->dst.dev->ifindex);
 	/*
 	 * Note to self - change this if input routes reverse direction when
 	 * they deal only with inputs and not with replies like they do
@@ -1497,11 +1496,11 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	RTA_PUT(skb, RTA_PREFSRC, 2, &rt->rt_local_src);
 	if (rt->rt_daddr != rt->rt_gateway)
 		RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway);
-	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
 		goto rtattr_failure;
-	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
-	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, expires,
-			       rt->u.dst.error) < 0)
+	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires,
+			       rt->dst.error) < 0)
 		goto rtattr_failure;
 	if (rt->fl.iif)
 		RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
@@ -1568,8 +1567,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 		local_bh_enable();
 		memset(cb, 0, sizeof(struct dn_skb_cb));
 		rt = (struct dn_route *)skb_dst(skb);
-		if (!err && -rt->u.dst.error)
-			err = rt->u.dst.error;
+		if (!err && -rt->dst.error)
+			err = rt->dst.error;
 	} else {
 		int oif = 0;
 		if (rta[RTA_OIF - 1])
@@ -1583,7 +1582,7 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
 	skb->dev = NULL;
 	if (err)
 		goto out_free;
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
@@ -1632,10 +1631,10 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		rcu_read_lock_bh();
 		for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
 			rt;
-			rt = rcu_dereference_bh(rt->u.dst.dn_next), idx++) {
+			rt = rcu_dereference_bh(rt->dst.dn_next), idx++) {
 			if (idx < s_idx)
 				continue;
-			skb_dst_set(skb, dst_clone(&rt->u.dst));
+			skb_dst_set(skb, dst_clone(&rt->dst));
 			if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
 					cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					1, NLM_F_MULTI) <= 0) {
@@ -1678,7 +1677,7 @@ static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_rou
 {
 	struct dn_rt_cache_iter_state *s = seq->private;
 
-	rt = rt->u.dst.dn_next;
+	rt = rt->dst.dn_next;
 	while(!rt) {
 		rcu_read_unlock_bh();
 		if (--s->bucket < 0)
@@ -1719,12 +1718,12 @@ static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
 	char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
 
 	seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
-			rt->u.dst.dev ? rt->u.dst.dev->name : "*",
+			rt->dst.dev ? rt->dst.dev->name : "*",
 			dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
 			dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
-			atomic_read(&rt->u.dst.__refcnt),
-			rt->u.dst.__use,
-			(int) dst_metric(&rt->u.dst, RTAX_RTT));
+			atomic_read(&rt->dst.__refcnt),
+			rt->dst.__use,
+			(int) dst_metric(&rt->dst, RTAX_RTT));
 	return 0;
 }
 
@@ -1758,6 +1757,7 @@ void __init dn_route_init(void)
 	dn_dst_ops.kmem_cachep =
 		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	dst_entries_init(&dn_dst_ops);
 	setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
 	add_timer(&dn_route_timer);
@@ -1816,5 +1816,6 @@ void __exit dn_route_cleanup(void)
 	dn_run_flush(0);
 
 	proc_net_remove(&init_net, "decnet_cache");
+	dst_entries_destroy(&dn_dst_ops);
 }
 
diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig
new file mode 100644
index 00000000000..50d49f7e047
--- /dev/null
+++ b/net/dns_resolver/Kconfig
@@ -0,0 +1,27 @@
+#
+# Configuration for DNS Resolver
+#
+config DNS_RESOLVER
+	tristate "DNS Resolver support"
+	depends on NET && KEYS
+	help
+	  Saying Y here will include support for the DNS Resolver key type
+	  which can be used to make upcalls to perform DNS lookups in
+	  userspace.
+
+	  DNS Resolver is used to query DNS server for information.  Examples
+	  being resolving a UNC hostname element to an IP address for CIFS or
+	  performing a DNS query for AFSDB records so that AFS can locate a
+	  cell's volume location database servers.
+
+	  DNS Resolver is used by the CIFS and AFS modules, and would support
+	  SMB2 later.  DNS Resolver is supported by the userspace upcall
+	  helper "/sbin/dns.resolver" via /etc/request-key.conf.
+
+	  See <file:Documentation/networking/dns_resolver.txt> for further
+	  information.
+
+	  To compile this as a module, choose M here: the module will be called
+	  dnsresolver.
+
+	  If unsure, say N.
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile
new file mode 100644
index 00000000000..c0ef4e71dc4
--- /dev/null
+++ b/net/dns_resolver/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux DNS Resolver.
+#
+
+obj-$(CONFIG_DNS_RESOLVER) += dns_resolver.o
+
+dns_resolver-objs :=  dns_key.o dns_query.o
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
new file mode 100644
index 00000000000..739435a6af3
--- /dev/null
+++ b/net/dns_resolver/dns_key.c
@@ -0,0 +1,293 @@
+/* Key type used to cache DNS lookups made by the kernel
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/keyctl.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("DNS Resolver");
+MODULE_AUTHOR("Wang Lei");
+MODULE_LICENSE("GPL");
+
+unsigned dns_resolver_debug;
+module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
+
+const struct cred *dns_resolver_cache;
+
+#define	DNS_ERRORNO_OPTION	"dnserror"
+
+/*
+ * Instantiate a user defined key for dns_resolver.
+ *
+ * The data must be a NUL-terminated string, with the NUL char accounted in
+ * datalen.
+ *
+ * If the data contains a '#' characters, then we take the clause after each
+ * one to be an option of the form 'key=value'.  The actual data of interest is
+ * the string leading up to the first '#'.  For instance:
+ *
+ *        "ip1,ip2,...#foo=bar"
+ */
+static int
+dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
+{
+	struct user_key_payload *upayload;
+	unsigned long derrno;
+	int ret;
+	size_t result_len = 0;
+	const char *data = _data, *end, *opt;
+
+	kenter("%%%d,%s,'%s',%zu",
+	       key->serial, key->description, data, datalen);
+
+	if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+		return -EINVAL;
+	datalen--;
+
+	/* deal with any options embedded in the data */
+	end = data + datalen;
+	opt = memchr(data, '#', datalen);
+	if (!opt) {
+		/* no options: the entire data is the result */
+		kdebug("no options");
+		result_len = datalen;
+	} else {
+		const char *next_opt;
+
+		result_len = opt - data;
+		opt++;
+		kdebug("options: '%s'", opt);
+		do {
+			const char *eq;
+			int opt_len, opt_nlen, opt_vlen, tmp;
+
+			next_opt = memchr(opt, '#', end - opt) ?: end;
+			opt_len = next_opt - opt;
+			if (!opt_len) {
+				printk(KERN_WARNING
+				       "Empty option to dns_resolver key %d\n",
+				       key->serial);
+				return -EINVAL;
+			}
+
+			eq = memchr(opt, '=', opt_len) ?: end;
+			opt_nlen = eq - opt;
+			eq++;
+			opt_vlen = next_opt - eq; /* will be -1 if no value */
+
+			tmp = opt_vlen >= 0 ? opt_vlen : 0;
+			kdebug("option '%*.*s' val '%*.*s'",
+			       opt_nlen, opt_nlen, opt, tmp, tmp, eq);
+
+			/* see if it's an error number representing a DNS error
+			 * that's to be recorded as the result in this key */
+			if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
+			    memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
+				kdebug("dns error number option");
+				if (opt_vlen <= 0)
+					goto bad_option_value;
+
+				ret = strict_strtoul(eq, 10, &derrno);
+				if (ret < 0)
+					goto bad_option_value;
+
+				if (derrno < 1 || derrno > 511)
+					goto bad_option_value;
+
+				kdebug("dns error no. = %lu", derrno);
+				key->type_data.x[0] = -derrno;
+				continue;
+			}
+
+		bad_option_value:
+			printk(KERN_WARNING
+			       "Option '%*.*s' to dns_resolver key %d:"
+			       " bad/missing value\n",
+			       opt_nlen, opt_nlen, opt, key->serial);
+			return -EINVAL;
+		} while (opt = next_opt + 1, opt < end);
+	}
+
+	/* don't cache the result if we're caching an error saying there's no
+	 * result */
+	if (key->type_data.x[0]) {
+		kleave(" = 0 [h_error %ld]", key->type_data.x[0]);
+		return 0;
+	}
+
+	kdebug("store result");
+	ret = key_payload_reserve(key, result_len);
+	if (ret < 0)
+		return -EINVAL;
+
+	upayload = kmalloc(sizeof(*upayload) + result_len + 1, GFP_KERNEL);
+	if (!upayload) {
+		kleave(" = -ENOMEM");
+		return -ENOMEM;
+	}
+
+	upayload->datalen = result_len;
+	memcpy(upayload->data, data, result_len);
+	upayload->data[result_len] = '\0';
+	rcu_assign_pointer(key->payload.data, upayload);
+
+	kleave(" = 0");
+	return 0;
+}
+
+/*
+ * The description is of the form "[<type>:]<domain_name>"
+ *
+ * The domain name may be a simple name or an absolute domain name (which
+ * should end with a period).  The domain name is case-independent.
+ */
+static int
+dns_resolver_match(const struct key *key, const void *description)
+{
+	int slen, dlen, ret = 0;
+	const char *src = key->description, *dsp = description;
+
+	kenter("%s,%s", src, dsp);
+
+	if (!src || !dsp)
+		goto no_match;
+
+	if (strcasecmp(src, dsp) == 0)
+		goto matched;
+
+	slen = strlen(src);
+	dlen = strlen(dsp);
+	if (slen <= 0 || dlen <= 0)
+		goto no_match;
+	if (src[slen - 1] == '.')
+		slen--;
+	if (dsp[dlen - 1] == '.')
+		dlen--;
+	if (slen != dlen || strncasecmp(src, dsp, slen) != 0)
+		goto no_match;
+
+matched:
+	ret = 1;
+no_match:
+	kleave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * Describe a DNS key
+ */
+static void dns_resolver_describe(const struct key *key, struct seq_file *m)
+{
+	int err = key->type_data.x[0];
+
+	seq_puts(m, key->description);
+	if (err)
+		seq_printf(m, ": %d", err);
+	else
+		seq_printf(m, ": %u", key->datalen);
+}
+
+struct key_type key_type_dns_resolver = {
+	.name		= "dns_resolver",
+	.instantiate	= dns_resolver_instantiate,
+	.match		= dns_resolver_match,
+	.revoke		= user_revoke,
+	.destroy	= user_destroy,
+	.describe	= dns_resolver_describe,
+	.read		= user_read,
+};
+
+static int __init init_dns_resolver(void)
+{
+	struct cred *cred;
+	struct key *keyring;
+	int ret;
+
+	printk(KERN_NOTICE "Registering the %s key type\n",
+	       key_type_dns_resolver.name);
+
+	/* create an override credential set with a special thread keyring in
+	 * which DNS requests are cached
+	 *
+	 * this is used to prevent malicious redirections from being installed
+	 * with add_key().
+	 */
+	cred = prepare_kernel_cred(NULL);
+	if (!cred)
+		return -ENOMEM;
+
+	keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
+			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			    KEY_USR_VIEW | KEY_USR_READ,
+			    KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(keyring)) {
+		ret = PTR_ERR(keyring);
+		goto failed_put_cred;
+	}
+
+	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		goto failed_put_key;
+
+	ret = register_key_type(&key_type_dns_resolver);
+	if (ret < 0)
+		goto failed_put_key;
+
+	/* instruct request_key() to use this special keyring as a cache for
+	 * the results it looks up */
+	cred->thread_keyring = keyring;
+	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	dns_resolver_cache = cred;
+
+	kdebug("DNS resolver keyring: %d\n", key_serial(keyring));
+	return 0;
+
+failed_put_key:
+	key_put(keyring);
+failed_put_cred:
+	put_cred(cred);
+	return ret;
+}
+
+static void __exit exit_dns_resolver(void)
+{
+	key_revoke(dns_resolver_cache->thread_keyring);
+	unregister_key_type(&key_type_dns_resolver);
+	put_cred(dns_resolver_cache);
+	printk(KERN_NOTICE "Unregistered %s key type\n",
+	       key_type_dns_resolver.name);
+}
+
+module_init(init_dns_resolver)
+module_exit(exit_dns_resolver)
+MODULE_LICENSE("GPL");
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
new file mode 100644
index 00000000000..c32be292c7e
--- /dev/null
+++ b/net/dns_resolver/dns_query.c
@@ -0,0 +1,165 @@
+/* Upcall routine, designed to work as a key type and working through
+ * /sbin/request-key to contact userspace when handling DNS queries.
+ *
+ * See Documentation/networking/dns_resolver.txt
+ *
+ *   Copyright (c) 2007 Igor Mammedov
+ *   Author(s): Igor Mammedov (niallain@gmail.com)
+ *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *		David Howells (dhowells@redhat.com)
+ *
+ *   The upcall wrapper used to make an arbitrary DNS query.
+ *
+ *   This function requires the appropriate userspace tool dns.upcall to be
+ *   installed and something like the following lines should be added to the
+ *   /etc/request-key.conf file:
+ *
+ *	create dns_resolver * * /sbin/dns.upcall %k
+ *
+ *   For example to use this module to query AFSDB RR:
+ *
+ *	create dns_resolver afsdb:* * /sbin/dns.afsdb %k
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dns_resolver.h>
+#include <linux/err.h>
+#include <keys/dns_resolver-type.h>
+#include <keys/user-type.h>
+
+#include "internal.h"
+
+/**
+ * dns_query - Query the DNS
+ * @type: Query type (or NULL for straight host->IP lookup)
+ * @name: Name to look up
+ * @namelen: Length of name
+ * @options: Request options (or NULL if no options)
+ * @_result: Where to place the returned data.
+ * @_expiry: Where to store the result expiry time (or NULL)
+ *
+ * The data will be returned in the pointer at *result, and the caller is
+ * responsible for freeing it.
+ *
+ * The description should be of the form "[<query_type>:]<domain_name>", and
+ * the options need to be appropriate for the query type requested.  If no
+ * query_type is given, then the query is a straight hostname to IP address
+ * lookup.
+ *
+ * The DNS resolution lookup is performed by upcalling to userspace by way of
+ * requesting a key of type dns_resolver.
+ *
+ * Returns the size of the result on success, -ve error code otherwise.
+ */
+int dns_query(const char *type, const char *name, size_t namelen,
+	      const char *options, char **_result, time_t *_expiry)
+{
+	struct key *rkey;
+	struct user_key_payload *upayload;
+	const struct cred *saved_cred;
+	size_t typelen, desclen;
+	char *desc, *cp;
+	int ret, len;
+
+	kenter("%s,%*.*s,%zu,%s",
+	       type, (int)namelen, (int)namelen, name, namelen, options);
+
+	if (!name || namelen == 0 || !_result)
+		return -EINVAL;
+
+	/* construct the query key description as "[<type>:]<name>" */
+	typelen = 0;
+	desclen = 0;
+	if (type) {
+		typelen = strlen(type);
+		if (typelen < 1)
+			return -EINVAL;
+		desclen += typelen + 1;
+	}
+
+	if (!namelen)
+		namelen = strlen(name);
+	if (namelen < 3)
+		return -EINVAL;
+	desclen += namelen + 1;
+
+	desc = kmalloc(desclen, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	cp = desc;
+	if (type) {
+		memcpy(cp, type, typelen);
+		cp += typelen;
+		*cp++ = ':';
+	}
+	memcpy(cp, name, namelen);
+	cp += namelen;
+	*cp = '\0';
+
+	if (!options)
+		options = "";
+	kdebug("call request_key(,%s,%s)", desc, options);
+
+	/* make the upcall, using special credentials to prevent the use of
+	 * add_key() to preinstall malicious redirections
+	 */
+	saved_cred = override_creds(dns_resolver_cache);
+	rkey = request_key(&key_type_dns_resolver, desc, options);
+	revert_creds(saved_cred);
+	kfree(desc);
+	if (IS_ERR(rkey)) {
+		ret = PTR_ERR(rkey);
+		goto out;
+	}
+
+	down_read(&rkey->sem);
+	rkey->perm |= KEY_USR_VIEW;
+
+	ret = key_validate(rkey);
+	if (ret < 0)
+		goto put;
+
+	/* If the DNS server gave an error, return that to the caller */
+	ret = rkey->type_data.x[0];
+	if (ret)
+		goto put;
+
+	upayload = rcu_dereference_protected(rkey->payload.data,
+					     lockdep_is_held(&rkey->sem));
+	len = upayload->datalen;
+
+	ret = -ENOMEM;
+	*_result = kmalloc(len + 1, GFP_KERNEL);
+	if (!*_result)
+		goto put;
+
+	memcpy(*_result, upayload->data, len + 1);
+	if (_expiry)
+		*_expiry = rkey->expiry;
+
+	ret = len;
+put:
+	up_read(&rkey->sem);
+	key_put(rkey);
+out:
+	kleave(" = %d", ret);
+	return ret;
+}
+EXPORT_SYMBOL(dns_query);
diff --git a/net/dns_resolver/internal.h b/net/dns_resolver/internal.h
new file mode 100644
index 00000000000..189ca9e9b78
--- /dev/null
+++ b/net/dns_resolver/internal.h
@@ -0,0 +1,44 @@
+/*
+ *   Copyright (c) 2010 Wang Lei
+ *   Author(s): Wang Lei (wang840925@gmail.com). All Rights Reserved.
+ *
+ *   Internal DNS Rsolver stuff
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/*
+ * dns_key.c
+ */
+extern const struct cred *dns_resolver_cache;
+
+/*
+ * debug tracing
+ */
+extern unsigned dns_resolver_debug;
+
+#define	kdebug(FMT, ...)				\
+do {							\
+	if (unlikely(dns_resolver_debug))		\
+		printk(KERN_DEBUG "[%-6.6s] "FMT"\n",	\
+		       current->comm, ##__VA_ARGS__);	\
+} while (0)
+
+#define kenter(FMT, ...) kdebug("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) kdebug("<== %s()"FMT"", __func__, ##__VA_ARGS__)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 11201784d29..87bb5f4de0e 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,7 +1,7 @@
 menuconfig NET_DSA
 	bool "Distributed Switch Architecture support"
 	default n
-	depends on EXPERIMENTAL && NET_ETHERNET && !S390
+	depends on EXPERIMENTAL && NETDEVICES && !S390
 	select PHYLIB
 	---help---
 	  This allows you to use hardware switch chips that use
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8fdca56bb08..64ca2a6fa0d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -164,10 +164,9 @@ out:
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
-	struct mii_ioctl_data *mii_data = if_mii(ifr);
 
 	if (p->phy != NULL)
-		return phy_mii_ioctl(p->phy, mii_data, cmd);
+		return phy_mii_ioctl(p->phy, ifr, cmd);
 
 	return -EOPNOTSUPP;
 }
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 2a5a8053e00..f8c1ae4b41f 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -48,7 +48,7 @@
 
 static const struct proto_ops econet_ops;
 static struct hlist_head econet_sklist;
-static DEFINE_RWLOCK(econet_lock);
+static DEFINE_SPINLOCK(econet_lock);
 static DEFINE_MUTEX(econet_mutex);
 
 /* Since there are only 256 possible network numbers (or fewer, depends
@@ -98,16 +98,16 @@ struct ec_cb
 
 static void econet_remove_socket(struct hlist_head *list, struct sock *sk)
 {
-	write_lock_bh(&econet_lock);
+	spin_lock_bh(&econet_lock);
 	sk_del_node_init(sk);
-	write_unlock_bh(&econet_lock);
+	spin_unlock_bh(&econet_lock);
 }
 
 static void econet_insert_socket(struct hlist_head *list, struct sock *sk)
 {
-	write_lock_bh(&econet_lock);
+	spin_lock_bh(&econet_lock);
 	sk_add_node(sk, list);
-	write_unlock_bh(&econet_lock);
+	spin_unlock_bh(&econet_lock);
 }
 
 /*
@@ -392,7 +392,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 		dev_queue_xmit(skb);
 		dev_put(dev);
 		mutex_unlock(&econet_mutex);
-		return(len);
+		return len;
 
 	out_free:
 		kfree_skb(skb);
@@ -637,7 +637,7 @@ static int econet_create(struct net *net, struct socket *sock, int protocol,
 	eo->num = protocol;
 
 	econet_insert_socket(&econet_sklist, sk);
-	return(0);
+	return 0;
 out:
 	return err;
 }
@@ -782,15 +782,19 @@ static struct sock *ec_listening_socket(unsigned char port, unsigned char
 	struct sock *sk;
 	struct hlist_node *node;
 
+	spin_lock(&econet_lock);
 	sk_for_each(sk, node, &econet_sklist) {
 		struct econet_sock *opt = ec_sk(sk);
 		if ((opt->port == port || opt->port == 0) &&
 		    (opt->station == station || opt->station == 0) &&
-		    (opt->net == net || opt->net == 0))
+		    (opt->net == net || opt->net == 0)) {
+			sock_hold(sk);
 			goto found;
+		}
 	}
 	sk = NULL;
 found:
+	spin_unlock(&econet_lock);
 	return sk;
 }
 
@@ -852,7 +856,7 @@ static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len)
 {
 	struct iphdr *ip = ip_hdr(skb);
 	unsigned char stn = ntohl(ip->saddr) & 0xff;
-	struct sock *sk;
+	struct sock *sk = NULL;
 	struct sk_buff *newskb;
 	struct ec_device *edev = skb->dev->ec_ptr;
 
@@ -882,10 +886,13 @@ static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len)
 	}
 
 	aun_send_response(ip->saddr, ah->handle, 3, 0);
+	sock_put(sk);
 	return;
 
 bad:
 	aun_send_response(ip->saddr, ah->handle, 4, 0);
+	if (sk)
+		sock_put(sk);
 }
 
 /*
@@ -1002,7 +1009,6 @@ static int __init aun_udp_initialise(void)
 	struct sockaddr_in sin;
 
 	skb_queue_head_init(&aun_queue);
-	spin_lock_init(&aun_queue_lock);
 	setup_timer(&ab_cleanup_timer, ab_cleanup, 0);
 	ab_cleanup_timer.expires = jiffies + (HZ*2);
 	add_timer(&ab_cleanup_timer);
@@ -1050,7 +1056,7 @@ release:
 static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct ec_framehdr *hdr;
-	struct sock *sk;
+	struct sock *sk = NULL;
 	struct ec_device *edev = dev->ec_ptr;
 
 	if (!net_eq(dev_net(dev), &init_net))
@@ -1085,10 +1091,12 @@ static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
 	if (ec_queue_packet(sk, skb, edev->net, hdr->src_stn, hdr->cb,
 			    hdr->port))
 		goto drop;
-
+	sock_put(sk);
 	return NET_RX_SUCCESS;
 
 drop:
+	if (sk)
+		sock_put(sk);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -1158,7 +1166,6 @@ static int __init econet_proto_init(void)
 		goto out;
 	sock_register(&econet_family_ops);
 #ifdef CONFIG_ECONET_AUNUDP
-	spin_lock_init(&aun_queue_lock);
 	aun_udp_initialise();
 #endif
 #ifdef CONFIG_ECONET_NATIVE
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 61ec0329316..f00ef2f1d81 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -158,7 +158,6 @@ EXPORT_SYMBOL(eth_rebuild_header);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
-	unsigned char *rawp;
 
 	skb->dev = dev;
 	skb_reset_mac_header(skb);
@@ -199,15 +198,13 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	if (ntohs(eth->h_proto) >= 1536)
 		return eth->h_proto;
 
-	rawp = skb->data;
-
 	/*
 	 *      This is a magic hack to spot IPX packets. Older Novell breaks
 	 *      the protocol design and runs IPX over 802.3 without an 802.2 LLC
 	 *      layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
 	 *      won't work for fault tolerant netware but does for the rest.
 	 */
-	if (*(unsigned short *)rawp == 0xFFFF)
+	if (skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF)
 		return htons(ETH_P_802_3);
 
 	/*
@@ -370,7 +367,7 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count)
 EXPORT_SYMBOL(alloc_etherdev_mq);
 
 static size_t _format_mac_addr(char *buf, int buflen,
-				const unsigned char *addr, int len)
+			       const unsigned char *addr, int len)
 {
 	int i;
 	char *cp = buf;
@@ -379,7 +376,7 @@ static size_t _format_mac_addr(char *buf, int buflen,
 		cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]);
 		if (i == len - 1)
 			break;
-		cp += strlcpy(cp, ":", buflen - (cp - buf));
+		cp += scnprintf(cp, buflen - (cp - buf), ":");
 	}
 	return cp - buf;
 }
@@ -389,7 +386,7 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 	size_t l;
 
 	l = _format_mac_addr(buf, PAGE_SIZE, addr, len);
-	l += strlcpy(buf + l, "\n", PAGE_SIZE - l);
-	return ((ssize_t) l);
+	l += scnprintf(buf + l, PAGE_SIZE - l, "\n");
+	return (ssize_t)l;
 }
 EXPORT_SYMBOL(sysfs_format_mac);
diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c
index eb00796758c..85d574addbc 100644
--- a/net/ethernet/pe2.c
+++ b/net/ethernet/pe2.c
@@ -28,11 +28,10 @@ struct datalink_proto *make_EII_client(void)
 
 	return proto;
 }
+EXPORT_SYMBOL(make_EII_client);
 
 void destroy_EII_client(struct datalink_proto *dl)
 {
 	kfree(dl);
 }
-
 EXPORT_SYMBOL(destroy_EII_client);
-EXPORT_SYMBOL(make_EII_client);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d19124..9e95d7fb6d5 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -46,7 +46,7 @@ config IP_ADVANCED_ROUTER
 	  rp_filter on use:
 
 	  echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
-	   and
+	   or
 	  echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
 
 	  Note that some distributions enable it in startup scripts.
@@ -84,7 +84,7 @@ config IP_FIB_TRIE
 
 	  An experimental study of compression methods for dynamic tries
 	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+	  <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
 
 endchoice
 
@@ -215,8 +215,15 @@ config NET_IPIP
 	  be inserted in and removed from the running kernel whenever you
 	  want). Most people won't need this and can say N.
 
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	 This is helper module to demultiplex GRE packets on GRE version field criteria.
+	 Required by ip_gre and pptp modules.
+
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
+	depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
@@ -412,7 +419,7 @@ config INET_XFRM_MODE_BEET
 	  If unsure, say Y.
 
 config INET_LRO
-	bool "Large Receive Offload (ipv4/tcp)"
+	tristate "Large Receive Offload (ipv4/tcp)"
 	default y
 	---help---
 	  Support for Large Receive Offload (ipv4/tcp).
@@ -555,7 +562,7 @@ config TCP_CONG_VENO
 	distinguishing to circumvent the difficult judgment of the packet loss
 	type. TCP Veno cuts down less congestion window in response to random
 	loss packets.
-	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+	See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186> 
 
 config TCP_CONG_YEAH
 	tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43a..4978d22f9a7 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 551ce564b03..f581f77d109 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
 
 /*
  * inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
  */
 void build_ehash_secret(void)
 {
 	u32 rnd;
+
 	do {
 		get_random_bytes(&rnd, sizeof(rnd));
 	} while (rnd == 0);
-	spin_lock_bh(&inetsw_lock);
-	if (!inet_ehash_secret)
-		inet_ehash_secret = rnd;
-	spin_unlock_bh(&inetsw_lock);
+
+	cmpxchg(&inet_ehash_secret, 0, rnd);
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
@@ -355,6 +353,8 @@ lookup_protocol:
 	inet = inet_sk(sk);
 	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
+	inet->nodefrag = 0;
+
 	if (SOCK_RAW == sock->type) {
 		inet->inet_num = protocol;
 		if (IPPROTO_RAW == protocol)
@@ -725,28 +725,31 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	sock_rps_record_flow(sk);
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+	    inet_autobind(sk))
 		return -EAGAIN;
 
 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
 }
 EXPORT_SYMBOL(inet_sendmsg);
 
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
-			     size_t size, int flags)
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+		      size_t size, int flags)
 {
 	struct sock *sk = sock->sk;
 
 	sock_rps_record_flow(sk);
 
 	/* We may need to bind the socket. */
-	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+	    inet_autobind(sk))
 		return -EAGAIN;
 
 	if (sk->sk_prot->sendpage)
 		return sk->sk_prot->sendpage(sk, page, offset, size, flags);
 	return sock_no_sendpage(sock, page, offset, size, flags);
 }
+EXPORT_SYMBOL(inet_sendpage);
 
 int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		 size_t size, int flags)
@@ -892,10 +895,10 @@ const struct proto_ops inet_stream_ops = {
 	.shutdown	   = inet_shutdown,
 	.setsockopt	   = sock_common_setsockopt,
 	.getsockopt	   = sock_common_getsockopt,
-	.sendmsg	   = tcp_sendmsg,
+	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
 	.mmap		   = sock_no_mmap,
-	.sendpage	   = tcp_sendpage,
+	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
@@ -1100,7 +1103,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 	if (err)
 		return err;
 
-	sk_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->dst);
 
 	new_saddr = rt->rt_src;
 
@@ -1166,7 +1169,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 	err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
 }
 	if (!err)
-		sk_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->dst);
 	else {
 		/* Routing failed... */
 		sk->sk_route_caps = 0;
@@ -1425,13 +1428,49 @@ unsigned long snmp_fold_field(void __percpu *mib[], int offt)
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+	u64 res = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *bhptr, *userptr;
+		struct u64_stats_sync *syncp;
+		u64 v_bh, v_user;
+		unsigned int start;
+
+		/* first mib used by softirq context, we must use _bh() accessors */
+		bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin_bh(syncp);
+			v_bh = *(((u64 *) bhptr) + offt);
+		} while (u64_stats_fetch_retry_bh(syncp, start));
+
+		/* second mib used in USER context */
+		userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
+		syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin(syncp);
+			v_user = *(((u64 *) userptr) + offt);
+		} while (u64_stats_fetch_retry(syncp, start));
+
+		res += v_bh + v_user;
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 {
 	BUG_ON(ptr == NULL);
-	ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long));
+	ptr[0] = __alloc_percpu(mibsize, align);
 	if (!ptr[0])
 		goto err0;
-	ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long));
+	ptr[1] = __alloc_percpu(mibsize, align);
 	if (!ptr[1])
 		goto err1;
 	return 0;
@@ -1488,25 +1527,32 @@ static const struct net_protocol icmp_protocol = {
 static __net_init int ipv4_mib_init_net(struct net *net)
 {
 	if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
-			  sizeof(struct tcp_mib)) < 0)
+			  sizeof(struct tcp_mib),
+			  __alignof__(struct tcp_mib)) < 0)
 		goto err_tcp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
-			  sizeof(struct ipstats_mib)) < 0)
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
-			  sizeof(struct linux_mib)) < 0)
+			  sizeof(struct linux_mib),
+			  __alignof__(struct linux_mib)) < 0)
 		goto err_net_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
-			  sizeof(struct udp_mib)) < 0)
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
-			  sizeof(struct udp_mib)) < 0)
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udplite_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
-			  sizeof(struct icmp_mib)) < 0)
+			  sizeof(struct icmp_mib),
+			  __alignof__(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
-			  sizeof(struct icmpmsg_mib)) < 0)
+			  sizeof(struct icmpmsg_mib),
+			  __alignof__(struct icmpmsg_mib)) < 0)
 		goto err_icmpmsg_mib;
 
 	tcp_mib_init(net);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f094b75810d..d8e540c5b07 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
  *		Stuart Cheshire	:	Metricom and grat arp fixes
  *					*** FOR 2.1 clean this up ***
  *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
- *		Alan Cox 	:	Took the AP1000 nasty FDDI hack and
+ *		Alan Cox	:	Took the AP1000 nasty FDDI hack and
  *					folded into the mainstream FDDI code.
  *					Ack spit, Linus how did you allow that
  *					one in...
@@ -116,17 +116,18 @@
 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
 #include <net/atmclip.h>
 struct neigh_table *clip_tbl_hook;
+EXPORT_SYMBOL(clip_tbl_hook);
 #endif
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <linux/netfilter_arp.h>
 
 /*
  *	Interface to generic neighbour cache.
  */
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
 static int arp_constructor(struct neighbour *neigh);
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -160,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
 	.queue_xmit =		dev_queue_xmit,
 };
 
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
 	.family =		AF_INET,
 	.solicit =		arp_solicit,
 	.error_report =		arp_error_report,
@@ -171,33 +172,34 @@ const struct neigh_ops arp_broken_ops = {
 };
 
 struct neigh_table arp_tbl = {
-	.family =	AF_INET,
-	.entry_size =	sizeof(struct neighbour) + 4,
-	.key_len =	4,
-	.hash =		arp_hash,
-	.constructor =	arp_constructor,
-	.proxy_redo =	parp_redo,
-	.id =		"arp_cache",
-	.parms = {
-		.tbl =			&arp_tbl,
-		.base_reachable_time =	30 * HZ,
-		.retrans_time =	1 * HZ,
-		.gc_staletime =	60 * HZ,
-		.reachable_time =		30 * HZ,
-		.delay_probe_time =	5 * HZ,
-		.queue_len =		3,
-		.ucast_probes =	3,
-		.mcast_probes =	3,
-		.anycast_delay =	1 * HZ,
-		.proxy_delay =		(8 * HZ) / 10,
-		.proxy_qlen =		64,
-		.locktime =		1 * HZ,
+	.family		= AF_INET,
+	.entry_size	= sizeof(struct neighbour) + 4,
+	.key_len	= 4,
+	.hash		= arp_hash,
+	.constructor	= arp_constructor,
+	.proxy_redo	= parp_redo,
+	.id		= "arp_cache",
+	.parms		= {
+		.tbl			= &arp_tbl,
+		.base_reachable_time	= 30 * HZ,
+		.retrans_time		= 1 * HZ,
+		.gc_staletime		= 60 * HZ,
+		.reachable_time		= 30 * HZ,
+		.delay_probe_time	= 5 * HZ,
+		.queue_len		= 3,
+		.ucast_probes		= 3,
+		.mcast_probes		= 3,
+		.anycast_delay		= 1 * HZ,
+		.proxy_delay		= (8 * HZ) / 10,
+		.proxy_qlen		= 64,
+		.locktime		= 1 * HZ,
 	},
-	.gc_interval =	30 * HZ,
-	.gc_thresh1 =	128,
-	.gc_thresh2 =	512,
-	.gc_thresh3 =	1024,
+	.gc_interval	= 30 * HZ,
+	.gc_thresh1	= 128,
+	.gc_thresh2	= 512,
+	.gc_thresh3	= 1024,
 };
+EXPORT_SYMBOL(arp_tbl);
 
 int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 {
@@ -223,14 +225,16 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 }
 
 
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+		    const struct net_device *dev,
+		    __u32 hash_rnd)
 {
-	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
 {
-	__be32 addr = *(__be32*)neigh->primary_key;
+	__be32 addr = *(__be32 *)neigh->primary_key;
 	struct net_device *dev = neigh->dev;
 	struct in_device *in_dev;
 	struct neigh_parms *parms;
@@ -293,16 +297,19 @@ static int arp_constructor(struct neighbour *neigh)
 			neigh->ops = &arp_broken_ops;
 			neigh->output = neigh->ops->output;
 			return 0;
+#else
+			break;
 #endif
-		;}
+		}
 #endif
 		if (neigh->type == RTN_MULTICAST) {
 			neigh->nud_state = NUD_NOARP;
 			arp_mc_map(addr, neigh->ha, dev, 1);
-		} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+		} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
 			neigh->nud_state = NUD_NOARP;
 			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
-		} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+		} else if (neigh->type == RTN_BROADCAST ||
+			   (dev->flags & IFF_POINTOPOINT)) {
 			neigh->nud_state = NUD_NOARP;
 			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
 		}
@@ -312,7 +319,7 @@ static int arp_constructor(struct neighbour *neigh)
 		else
 			neigh->ops = &arp_generic_ops;
 
-		if (neigh->nud_state&NUD_VALID)
+		if (neigh->nud_state & NUD_VALID)
 			neigh->output = neigh->ops->connected_output;
 		else
 			neigh->output = neigh->ops->output;
@@ -331,17 +338,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	__be32 saddr = 0;
 	u8  *dst_ha = NULL;
 	struct net_device *dev = neigh->dev;
-	__be32 target = *(__be32*)neigh->primary_key;
+	__be32 target = *(__be32 *)neigh->primary_key;
 	int probes = atomic_read(&neigh->probes);
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev;
 
-	if (!in_dev)
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
 		return;
-
+	}
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:		/* By default announce any local IP */
-		if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL)
+		if (skb && inet_addr_type(dev_net(dev),
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
 			saddr = ip_hdr(skb)->saddr;
 		break;
 	case 1:		/* Restrict announcements of saddr in same subnet */
@@ -358,22 +369,26 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
 		break;
 	}
+	rcu_read_unlock();
 
-	if (in_dev)
-		in_dev_put(in_dev);
 	if (!saddr)
 		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
 
-	if ((probes -= neigh->parms->ucast_probes) < 0) {
-		if (!(neigh->nud_state&NUD_VALID))
-			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+	probes -= neigh->parms->ucast_probes;
+	if (probes < 0) {
+		if (!(neigh->nud_state & NUD_VALID))
+			printk(KERN_DEBUG
+			       "trying to ucast probe in NUD_INVALID\n");
 		dst_ha = neigh->ha;
 		read_lock_bh(&neigh->lock);
-	} else if ((probes -= neigh->parms->app_probes) < 0) {
+	} else {
+		probes -= neigh->parms->app_probes;
+		if (probes < 0) {
 #ifdef CONFIG_ARPD
-		neigh_app_ns(neigh);
+			neigh_app_ns(neigh);
 #endif
-		return;
+			return;
+		}
 	}
 
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -427,7 +442,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 
 	if (ip_route_output_key(net, &rt, &fl) < 0)
 		return 1;
-	if (rt->u.dst.dev != dev) {
+	if (rt->dst.dev != dev) {
 		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
 		flag = 1;
 	}
@@ -446,7 +461,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
  *	is allowed to use this function, it is scheduled to be removed. --ANK
  */
 
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+			      __be32 paddr, struct net_device *dev)
 {
 	switch (addr_hint) {
 	case RTN_LOCAL:
@@ -478,17 +494,16 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 
 	paddr = skb_rtable(skb)->rt_gateway;
 
-	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
+	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+			       paddr, dev))
 		return 0;
 
 	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
 
 	if (n) {
 		n->used = jiffies;
-		if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
-			read_lock_bh(&n->lock);
-			memcpy(haddr, n->ha, dev->addr_len);
-			read_unlock_bh(&n->lock);
+		if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+			neigh_ha_snapshot(haddr, n, dev);
 			neigh_release(n);
 			return 0;
 		}
@@ -497,6 +512,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 		kfree_skb(skb);
 	return 1;
 }
+EXPORT_SYMBOL(arp_find);
 
 /* END OF OBSOLETE FUNCTIONS */
 
@@ -509,13 +525,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
 		return -EINVAL;
 	if (n == NULL) {
 		__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
-		if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+		if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
 			nexthop = 0;
 		n = __neigh_lookup_errno(
 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-		    dev->type == ARPHRD_ATM ? clip_tbl_hook :
+					 dev->type == ARPHRD_ATM ?
+					 clip_tbl_hook :
 #endif
-		    &arp_tbl, &nexthop, dev);
+					 &arp_tbl, &nexthop, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
 		dst->neighbour = n;
@@ -532,24 +549,24 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 	struct in_device *out_dev;
 	int imi, omi = -1;
 
-	if (rt->u.dst.dev == dev)
+	if (rt->dst.dev == dev)
 		return 0;
 
 	if (!IN_DEV_PROXY_ARP(in_dev))
 		return 0;
-
-	if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+	imi = IN_DEV_MEDIUM_ID(in_dev);
+	if (imi == 0)
 		return 1;
 	if (imi == -1)
 		return 0;
 
 	/* place to check for proxy_arp for routes */
 
-	if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+	out_dev = __in_dev_get_rcu(rt->dst.dev);
+	if (out_dev)
 		omi = IN_DEV_MEDIUM_ID(out_dev);
-		in_dev_put(out_dev);
-	}
-	return (omi != imi && omi != -1);
+
+	return omi != imi && omi != -1;
 }
 
 /*
@@ -576,7 +593,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev,
 				__be32 sip, __be32 tip)
 {
 	/* Private VLAN is only concerned about the same ethernet segment */
-	if (rt->u.dst.dev != dev)
+	if (rt->dst.dev != dev)
 		return 0;
 
 	/* Don't reply on self probes (often done by windowz boxes)*/
@@ -679,7 +696,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp->ar_pln = 4;
 	arp->ar_op = htons(type);
 
-	arp_ptr=(unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 
 	memcpy(arp_ptr, src_hw, dev->addr_len);
 	arp_ptr += dev->addr_len;
@@ -698,6 +715,7 @@ out:
 	kfree_skb(skb);
 	return NULL;
 }
+EXPORT_SYMBOL(arp_create);
 
 /*
  *	Send an arp packet.
@@ -707,6 +725,7 @@ void arp_xmit(struct sk_buff *skb)
 	/* Send it off, maybe filter it using firewalling first.  */
 	NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
 }
+EXPORT_SYMBOL(arp_xmit);
 
 /*
  *	Create and send an arp packet.
@@ -727,12 +746,12 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 
 	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
 			 dest_hw, src_hw, target_hw);
-	if (skb == NULL) {
+	if (skb == NULL)
 		return;
-	}
 
 	arp_xmit(skb);
 }
+EXPORT_SYMBOL(arp_send);
 
 /*
  *	Process an arp request.
@@ -741,7 +760,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 static int arp_process(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	struct rtable *rt;
@@ -806,7 +825,7 @@ static int arp_process(struct sk_buff *skb)
 /*
  *	Extract fields
  */
-	arp_ptr= (unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 	sha	= arp_ptr;
 	arp_ptr += dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
@@ -860,16 +879,17 @@ static int arp_process(struct sk_buff *skb)
 		addr_type = rt->rt_type;
 
 		if (addr_type == RTN_LOCAL) {
-			int dont_send = 0;
+			int dont_send;
 
-			if (!dont_send)
-				dont_send |= arp_ignore(in_dev,sip,tip);
+			dont_send = arp_ignore(in_dev, sip, tip);
 			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-				dont_send |= arp_filter(sip,tip,dev);
+				dont_send |= arp_filter(sip, tip, dev);
 			if (!dont_send) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n) {
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+						 dev, tip, sha, dev->dev_addr,
+						 sha);
 					neigh_release(n);
 				}
 			}
@@ -878,8 +898,7 @@ static int arp_process(struct sk_buff *skb)
 			if (addr_type == RTN_UNICAST  &&
 			    (arp_fwd_proxy(in_dev, dev, rt) ||
 			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
-			     pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
-			{
+			     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n)
 					neigh_release(n);
@@ -887,10 +906,12 @@ static int arp_process(struct sk_buff *skb)
 				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
 				    skb->pkt_type == PACKET_HOST ||
 				    in_dev->arp_parms->proxy_delay == 0) {
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+						 dev, tip, sha, dev->dev_addr,
+						 sha);
 				} else {
-					pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
-					in_dev_put(in_dev);
+					pneigh_enqueue(&arp_tbl,
+						       in_dev->arp_parms, skb);
 					return 0;
 				}
 				goto out;
@@ -931,13 +952,12 @@ static int arp_process(struct sk_buff *skb)
 		if (arp->ar_op != htons(ARPOP_REPLY) ||
 		    skb->pkt_type != PACKET_HOST)
 			state = NUD_STALE;
-		neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+		neigh_update(n, sha, state,
+			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
 		neigh_release(n);
 	}
 
 out:
-	if (in_dev)
-		in_dev_put(in_dev);
 	consume_skb(skb);
 	return 0;
 }
@@ -969,7 +989,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 	    arp->ar_pln != 4)
 		goto freeskb;
 
-	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL)
 		goto out_of_mem;
 
 	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1013,7 +1034,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 		return -EINVAL;
 	if (!dev && (r->arp_flags & ATF_COM)) {
 		dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
-				r->arp_ha.sa_data);
+				      r->arp_ha.sa_data);
 		if (!dev)
 			return -ENODEV;
 	}
@@ -1027,7 +1048,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_set(struct net *net, struct arpreq *r,
-		struct net_device * dev)
+		       struct net_device *dev)
 {
 	__be32 ip;
 	struct neighbour *neigh;
@@ -1040,12 +1061,13 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+		struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+						    .tos = RTO_ONLINK } };
+		struct rtable *rt;
+		err = ip_route_output_key(net, &rt, &fl);
+		if (err != 0)
 			return err;
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
 			return -EINVAL;
@@ -1077,9 +1099,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		unsigned state = NUD_STALE;
 		if (r->arp_flags & ATF_PERM)
 			state = NUD_PERMANENT;
-		err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
 				   r->arp_ha.sa_data : NULL, state,
-				   NEIGH_UPDATE_F_OVERRIDE|
+				   NEIGH_UPDATE_F_OVERRIDE |
 				   NEIGH_UPDATE_F_ADMIN);
 		neigh_release(neigh);
 	}
@@ -1088,12 +1110,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 
 static unsigned arp_state_to_flags(struct neighbour *neigh)
 {
-	unsigned flags = 0;
 	if (neigh->nud_state&NUD_PERMANENT)
-		flags = ATF_PERM|ATF_COM;
+		return ATF_PERM | ATF_COM;
 	else if (neigh->nud_state&NUD_VALID)
-		flags = ATF_COM;
-	return flags;
+		return ATF_COM;
+	else
+		return 0;
 }
 
 /*
@@ -1136,7 +1158,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_delete(struct net *net, struct arpreq *r,
-		struct net_device * dev)
+			  struct net_device *dev)
 {
 	int err;
 	__be32 ip;
@@ -1147,12 +1169,13 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+		struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+						    .tos = RTO_ONLINK } };
+		struct rtable *rt;
+		err = ip_route_output_key(net, &rt, &fl);
+		if (err != 0)
 			return err;
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (!dev)
 			return -EINVAL;
@@ -1160,7 +1183,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 	err = -ENXIO;
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		if (neigh->nud_state&~NUD_NOARP)
+		if (neigh->nud_state & ~NUD_NOARP)
 			err = neigh_update(neigh, NULL, NUD_FAILED,
 					   NEIGH_UPDATE_F_OVERRIDE|
 					   NEIGH_UPDATE_F_ADMIN);
@@ -1180,24 +1203,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	struct net_device *dev = NULL;
 
 	switch (cmd) {
-		case SIOCDARP:
-		case SIOCSARP:
-			if (!capable(CAP_NET_ADMIN))
-				return -EPERM;
-		case SIOCGARP:
-			err = copy_from_user(&r, arg, sizeof(struct arpreq));
-			if (err)
-				return -EFAULT;
-			break;
-		default:
-			return -EINVAL;
+	case SIOCDARP:
+	case SIOCSARP:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+	case SIOCGARP:
+		err = copy_from_user(&r, arg, sizeof(struct arpreq));
+		if (err)
+			return -EFAULT;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	if (r.arp_pa.sa_family != AF_INET)
 		return -EPFNOSUPPORT;
 
 	if (!(r.arp_flags & ATF_PUBL) &&
-	    (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+	    (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
 		return -EINVAL;
 	if (!(r.arp_flags & ATF_NETMASK))
 		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1205,7 +1228,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
-		if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
+		dev = __dev_get_by_name(net, r.arp_dev);
+		if (dev == NULL)
 			goto out;
 
 		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1237,7 +1261,8 @@ out:
 	return err;
 }
 
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
 {
 	struct net_device *dev = ptr;
 
@@ -1305,12 +1330,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	for (n = 0, s = buf; n < 6; n++) {
 		c = (a->ax25_call[n] >> 1) & 0x7F;
 
-		if (c != ' ') *s++ = c;
+		if (c != ' ')
+			*s++ = c;
 	}
 
 	*s++ = '-';
-
-	if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+	n = (a->ax25_call[6] >> 1) & 0x0F;
+	if (n > 9) {
 		*s++ = '1';
 		n -= 10;
 	}
@@ -1319,10 +1345,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	*s++ = '\0';
 
 	if (*buf == '\0' || *buf == '-')
-	   return "*";
+		return "*";
 
 	return buf;
-
 }
 #endif /* CONFIG_AX25 */
 
@@ -1402,10 +1427,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
 /* ------------------------------------------------------------------------ */
 
 static const struct seq_operations arp_seq_ops = {
-	.start  = arp_seq_start,
-	.next   = neigh_seq_next,
-	.stop   = neigh_seq_stop,
-	.show   = arp_seq_show,
+	.start	= arp_seq_start,
+	.next	= neigh_seq_next,
+	.stop	= neigh_seq_stop,
+	.show	= arp_seq_show,
 };
 
 static int arp_seq_open(struct inode *inode, struct file *file)
@@ -1453,14 +1478,3 @@ static int __init arp_proc_init(void)
 }
 
 #endif /* CONFIG_PROC_FS */
-
-EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(arp_find);
-EXPORT_SYMBOL(arp_create);
-EXPORT_SYMBOL(arp_xmit);
-EXPORT_SYMBOL(arp_send);
-EXPORT_SYMBOL(arp_tbl);
-
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 3a92a76ae41..094e150c626 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -9,7 +9,7 @@
  *
  * The CIPSO draft specification can be found in the kernel's Documentation
  * directory as well as the following URL:
- *   http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
+ *   http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
  * The FIPS-188 specification can be found at the following URL:
  *   http://www.itl.nist.gov/fipspubs/fip188.htm
  *
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index fb2465811b4..174be6caa5c 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -62,16 +62,17 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	}
 	if (!inet->inet_saddr)
 		inet->inet_saddr = rt->rt_src;	/* Update source address */
-	if (!inet->inet_rcv_saddr)
+	if (!inet->inet_rcv_saddr) {
 		inet->inet_rcv_saddr = rt->rt_src;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
 	inet->inet_daddr = rt->rt_dst;
 	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
 	inet->inet_id = jiffies;
 
-	sk_dst_set(sk, &rt->u.dst);
-	return(0);
+	sk_dst_set(sk, &rt->dst);
+	return 0;
 }
-
 EXPORT_SYMBOL(ip4_datagram_connect);
-
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 382bc768ed5..dc94b0316b7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
 		inet_free_ifa(ifa);
 	}
 
-	dev->ip_ptr = NULL;
+	rcu_assign_pointer(dev->ip_ptr, NULL);
 
 	devinet_sysctl_unregister(in_dev);
 	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
 	return inet_insert_ifa(ifa);
 }
 
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
 struct in_device *inetdev_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
-		in_dev = in_dev_get(dev);
+		in_dev = rcu_dereference_rtnl(dev->ip_ptr);
 	rcu_read_unlock();
 	return in_dev;
 }
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		goto errout;
 	}
 
-	__in_dev_put(in_dev);
-
 	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
 	     ifap = &ifa->ifa_next) {
 		if (tb[IFA_LOCAL] &&
@@ -1059,7 +1060,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_REGISTER:
 		printk(KERN_DEBUG "inetdev_event: bug\n");
-		dev->ip_ptr = NULL;
+		rcu_assign_pointer(dev->ip_ptr, NULL);
 		break;
 	case NETDEV_UP:
 		if (!inetdev_valid_mtu(dev->mtu))
@@ -1081,6 +1082,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		}
 		ip_mc_up(in_dev);
 		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_CHANGEADDR:
 		/* Send gratuitous ARP to notify of link change */
 		if (IN_DEV_ARP_NOTIFY(in_dev)) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4f0ed458c88..eb6f69a8f27 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,34 +147,43 @@ static void fib_flush(struct net *net)
 		rt_cache_flush(net, -1);
 }
 
-/*
- *	Find the first device with a given source address.
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
  */
-
-struct net_device * ip_dev_find(struct net *net, __be32 addr)
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
-	struct fib_result res;
+	struct flowi fl = {
+		.nl_u = {
+			.ip4_u = {
+				.daddr = addr
+			}
+		},
+		.flags = FLOWI_FLAG_MATCH_ANY_IIF
+	};
+	struct fib_result res = { 0 };
 	struct net_device *dev = NULL;
-	struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
 
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table || fib_table_lookup(local_table, &fl, &res))
+	rcu_read_lock();
+	if (fib_lookup(net, &fl, &res)) {
+		rcu_read_unlock();
 		return NULL;
+	}
 	if (res.type != RTN_LOCAL)
 		goto out;
 	dev = FIB_RES_DEV(res);
 
-	if (dev)
+	if (dev && devref)
 		dev_hold(dev);
 out:
-	fib_res_put(&res);
+	rcu_read_unlock();
 	return dev;
 }
+EXPORT_SYMBOL(__ip_dev_find);
 
 /*
  * Find address type as if only "dev" was present in the system. If
@@ -201,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
 	if (local_table) {
 		ret = RTN_UNICAST;
-		if (!fib_table_lookup(local_table, &fl, &res)) {
+		rcu_read_lock();
+		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
-			fib_res_put(&res);
 		}
+		rcu_read_unlock();
 	}
 	return ret;
 }
@@ -214,40 +224,46 @@ unsigned int inet_addr_type(struct net *net, __be32 addr)
 {
 	return __inet_dev_addr_type(net, NULL, addr);
 }
+EXPORT_SYMBOL(inet_addr_type);
 
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
-       return __inet_dev_addr_type(net, dev, addr);
+	return __inet_dev_addr_type(net, dev, addr);
 }
+EXPORT_SYMBOL(inet_dev_addr_type);
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
-   - (main) check, that source is valid i.e. not broadcast or our local
-     address.
-   - figure out what "logical" interface this packet arrived
-     and calculate "specific destination" address.
-   - check, that packet arrived from expected physical interface.
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ *   address.
+ * - figure out what "logical" interface this packet arrived
+ *   and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
  */
-
 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			struct net_device *dev, __be32 *spec_dst,
 			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
-	struct flowi fl = { .nl_u = { .ip4_u =
-				      { .daddr = src,
-					.saddr = dst,
-					.tos = tos } },
-			    .mark = mark,
-			    .iif = oif };
-
+	struct flowi fl = {
+		.nl_u = {
+			.ip4_u = {
+				.daddr = src,
+				.saddr = dst,
+				.tos = tos
+			}
+		},
+		.mark = mark,
+		.iif = oif
+	};
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
+	bool dev_match;
 	int ret;
 	struct net *net;
 
 	no_addr = rpf = accept_local = 0;
-	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
@@ -256,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		if (mark && !IN_DEV_SRC_VMARK(in_dev))
 			fl.mark = 0;
 	}
-	rcu_read_unlock();
 
 	if (in_dev == NULL)
 		goto e_inval;
@@ -266,25 +281,33 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
 		if (res.type != RTN_LOCAL || !accept_local)
-			goto e_inval_res;
+			goto e_inval;
 	}
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
+	dev_match = false;
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+		struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+		if (nh->nh_dev == dev) {
+			dev_match = true;
+			break;
+		}
+	}
 #else
 	if (FIB_RES_DEV(res) == dev)
+		dev_match = true;
 #endif
-	{
+	if (dev_match) {
 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
-		fib_res_put(&res);
 		return ret;
 	}
-	fib_res_put(&res);
 	if (no_addr)
 		goto last_resort;
 	if (rpf == 1)
-		goto e_inval;
+		goto e_rpf;
 	fl.oif = dev->ifindex;
 
 	ret = 0;
@@ -293,21 +316,20 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			*spec_dst = FIB_RES_PREFSRC(res);
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		}
-		fib_res_put(&res);
 	}
 	return ret;
 
 last_resort:
 	if (rpf)
-		goto e_inval;
+		goto e_rpf;
 	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
 	*itag = 0;
 	return 0;
 
-e_inval_res:
-	fib_res_put(&res);
 e_inval:
 	return -EINVAL;
+e_rpf:
+	return -EXDEV;
 }
 
 static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -456,9 +478,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 }
 
 /*
- *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
  */
-
 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 {
 	struct fib_config cfg;
@@ -502,7 +524,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	return -EINVAL;
 }
 
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_DST]		= { .type = NLA_U32 },
 	[RTA_SRC]		= { .type = NLA_U32 },
 	[RTA_IIF]		= { .type = NLA_U32 },
@@ -516,7 +538,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
-			    struct nlmsghdr *nlh, struct fib_config *cfg)
+			     struct nlmsghdr *nlh, struct fib_config *cfg)
 {
 	struct nlattr *attr;
 	int err, remaining;
@@ -671,12 +693,11 @@ out:
 }
 
 /* Prepare and feed intra-kernel routing request.
-   Really, it should be netlink message, but :-( netlink
-   can be not configured, so that we feed it directly
-   to fib engine. It is legal, because all events occur
-   only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
  */
-
 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -722,9 +743,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 	struct in_ifaddr *prim = ifa;
 	__be32 mask = ifa->ifa_mask;
 	__be32 addr = ifa->ifa_local;
-	__be32 prefix = ifa->ifa_address&mask;
+	__be32 prefix = ifa->ifa_address & mask;
 
-	if (ifa->ifa_flags&IFA_F_SECONDARY) {
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
 		if (prim == NULL) {
 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -734,22 +755,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 
 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
 
-	if (!(dev->flags&IFF_UP))
+	if (!(dev->flags & IFF_UP))
 		return;
 
 	/* Add broadcast address, if it is explicitly assigned. */
 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 
-	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
-		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
-			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+		fib_magic(RTM_NEWROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  prefix, ifa->ifa_prefixlen, prim);
 
 		/* Add network specific broadcasts, when it takes a sense */
 		if (ifa->ifa_prefixlen < 31) {
 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
-			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+				  32, prim);
 		}
 	}
 }
@@ -760,17 +783,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	struct net_device *dev = in_dev->dev;
 	struct in_ifaddr *ifa1;
 	struct in_ifaddr *prim = ifa;
-	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
-	__be32 any = ifa->ifa_address&ifa->ifa_mask;
+	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+	__be32 any = ifa->ifa_address & ifa->ifa_mask;
 #define LOCAL_OK	1
 #define BRD_OK		2
 #define BRD0_OK		4
 #define BRD1_OK		8
 	unsigned ok = 0;
 
-	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
-		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
-			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+		fib_magic(RTM_DELROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  any, ifa->ifa_prefixlen, prim);
 	else {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 		if (prim == NULL) {
@@ -780,9 +804,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	}
 
 	/* Deletion is more complicated than add.
-	   We should take care of not to delete too much :-)
-
-	   Scan address list to be sure that addresses are really gone.
+	 * We should take care of not to delete too much :-)
+	 *
+	 * Scan address list to be sure that addresses are really gone.
 	 */
 
 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -796,23 +820,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 			ok |= BRD0_OK;
 	}
 
-	if (!(ok&BRD_OK))
+	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
-	if (!(ok&BRD1_OK))
+	if (!(ok & BRD1_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
-	if (!(ok&BRD0_OK))
+	if (!(ok & BRD0_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
-	if (!(ok&LOCAL_OK)) {
+	if (!(ok & LOCAL_OK)) {
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 
 		/* Check, that this local address finally disappeared. */
 		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
 			/* And the last, but not the least thing.
-			   We must flush stray FIB entries.
-
-			   First of all, we scan fib_info list searching
-			   for stray nexthop entries, then ignite fib_flush.
-			*/
+			 * We must flush stray FIB entries.
+			 *
+			 * First of all, we scan fib_info list searching
+			 * for stray nexthop entries, then ignite fib_flush.
+			 */
 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
 				fib_flush(dev_net(dev));
 		}
@@ -823,14 +847,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #undef BRD1_OK
 }
 
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 {
 
 	struct fib_result       res;
-	struct flowi            fl = { .mark = frn->fl_mark,
-				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
-							    .tos = frn->fl_tos,
-							    .scope = frn->fl_scope } } };
+	struct flowi            fl = {
+		.mark = frn->fl_mark,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = frn->fl_addr,
+				.tos = frn->fl_tos,
+				.scope = frn->fl_scope
+			}
+		}
+	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	res.r = NULL;
@@ -841,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
 		local_bh_disable();
 
 		frn->tb_id = tb->tb_id;
-		frn->err = fib_table_lookup(tb, &fl, &res);
+		rcu_read_lock();
+		frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
 
 		if (!frn->err) {
 			frn->prefixlen = res.prefixlen;
 			frn->nh_sel = res.nh_sel;
 			frn->type = res.type;
 			frn->scope = res.scope;
-			fib_res_put(&res);
 		}
+		rcu_read_unlock();
 		local_bh_enable();
 	}
 }
@@ -878,8 +909,8 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
-	NETLINK_CB(skb).pid = 0;         /* from kernel */
+	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
+	NETLINK_CB(skb).pid = 0;        /* from kernel */
 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
 }
@@ -926,7 +957,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 		fib_del_ifaddr(ifa);
 		if (ifa->ifa_dev->ifa_list == NULL) {
 			/* Last address was deleted from this interface.
-			   Disable IP.
+			 * Disable IP.
 			 */
 			fib_disable_ip(dev, 1, 0);
 		} else {
@@ -985,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
 static int __net_init ip_fib_net_init(struct net *net)
 {
 	int err;
-	unsigned int i;
+	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
 
-	net->ipv4.fib_table_hash = kzalloc(
-			sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+	/* Avoid false sharing : Use at least a full cache line */
+	size = max_t(size_t, size, L1_CACHE_BYTES);
+
+	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
 	if (net->ipv4.fib_table_hash == NULL)
 		return -ENOMEM;
 
-	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
-		INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
-
 	err = fib4_rules_init(net);
 	if (err < 0)
 		goto fail;
@@ -1022,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net)
 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
 			hlist_del(node);
 			fib_table_flush(tb);
-			kfree(tb);
+			fib_free_table(tb);
 		}
 	}
 	kfree(net->ipv4.fib_table_hash);
@@ -1075,7 +1105,3 @@ void __init ip_fib_init(void)
 
 	fib_hash_init();
 }
-
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(inet_dev_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1b..b3acb0417b2 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,36 +54,37 @@ struct fib_node {
 	struct fib_alias        fn_embedded_alias;
 };
 
-struct fn_zone {
-	struct fn_zone		*fz_next;	/* Next not empty zone	*/
-	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
-	int			fz_nent;	/* Number of entries	*/
+#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
 
-	int			fz_divisor;	/* Hash divisor		*/
+struct fn_zone {
+	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
+	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/
+	seqlock_t		fz_lock;
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
-#define FZ_HASHMASK(fz)		((fz)->fz_hashmask)
 
-	int			fz_order;	/* Zone order		*/
-	__be32			fz_mask;
+	u8			fz_order;	/* Zone order (0..32)	*/
+	u8			fz_revorder;	/* 32 - fz_order	*/
+	__be32			fz_mask;	/* inet_make_mask(order) */
 #define FZ_MASK(fz)		((fz)->fz_mask)
-};
 
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
+	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE];
+
+	int			fz_nent;	/* Number of entries	*/
+	int			fz_divisor;	/* Hash size (mask+1)	*/
+};
 
 struct fn_hash {
-	struct fn_zone	*fn_zones[33];
-	struct fn_zone	*fn_zone_list;
+	struct fn_zone		*fn_zones[33];
+	struct fn_zone __rcu	*fn_zone_list;
 };
 
 static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
 {
-	u32 h = ntohl(key)>>(32 - fz->fz_order);
+	u32 h = ntohl(key) >> fz->fz_revorder;
 	h ^= (h>>20);
 	h ^= (h>>10);
 	h ^= (h>>5);
-	h &= FZ_HASHMASK(fz);
+	h &= fz->fz_hashmask;
 	return h;
 }
 
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
 	return dst & FZ_MASK(fz);
 }
 
-static DEFINE_RWLOCK(fib_hash_lock);
 static unsigned int fib_hash_genid;
 
 #define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
 {
 	unsigned long size = divisor * sizeof(struct hlist_head);
 
-	if (size <= PAGE_SIZE) {
+	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
-	} else {
-		return (struct hlist_head *)
-			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-	}
+
+	return (struct hlist_head *)
+		__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
 }
 
 /* The fib hash lock must be held when this is called. */
@@ -123,10 +122,11 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
 		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
 			struct hlist_head *new_head;
 
-			hlist_del(&f->fn_hash);
+			hlist_del_rcu(&f->fn_hash);
 
-			new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
-			hlist_add_head(&f->fn_hash, new_head);
+			new_head = rcu_dereference_protected(fz->fz_hash, 1) +
+				   fn_hash(f->fn_key, fz);
+			hlist_add_head_rcu(&f->fn_hash, new_head);
 		}
 	}
 }
@@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
 	int old_divisor, new_divisor;
 	u32 new_hashmask;
 
-	old_divisor = fz->fz_divisor;
+	new_divisor = old_divisor = fz->fz_divisor;
 
 	switch (old_divisor) {
-	case 16:
-		new_divisor = 256;
+	case EMBEDDED_HASH_SIZE:
+		new_divisor *= EMBEDDED_HASH_SIZE;
 		break;
-	case 256:
-		new_divisor = 1024;
+	case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
+		new_divisor *= (EMBEDDED_HASH_SIZE/2);
 		break;
 	default:
 		if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -175,31 +175,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
 	ht = fz_hash_alloc(new_divisor);
 
 	if (ht)	{
-		write_lock_bh(&fib_hash_lock);
-		old_ht = fz->fz_hash;
-		fz->fz_hash = ht;
+		struct fn_zone nfz;
+
+		memcpy(&nfz, fz, sizeof(nfz));
+
+		write_seqlock_bh(&fz->fz_lock);
+		old_ht = rcu_dereference_protected(fz->fz_hash, 1);
+		RCU_INIT_POINTER(nfz.fz_hash, ht);
+		nfz.fz_hashmask = new_hashmask;
+		nfz.fz_divisor = new_divisor;
+		fn_rebuild_zone(&nfz, old_ht, old_divisor);
+		fib_hash_genid++;
+		rcu_assign_pointer(fz->fz_hash, ht);
 		fz->fz_hashmask = new_hashmask;
 		fz->fz_divisor = new_divisor;
-		fn_rebuild_zone(fz, old_ht, old_divisor);
-		fib_hash_genid++;
-		write_unlock_bh(&fib_hash_lock);
+		write_sequnlock_bh(&fz->fz_lock);
 
-		fz_hash_free(old_ht, old_divisor);
+		if (old_ht != fz->fz_embedded_hash) {
+			synchronize_rcu();
+			fz_hash_free(old_ht, old_divisor);
+		}
 	}
 }
 
-static inline void fn_free_node(struct fib_node * f)
+static void fn_free_node_rcu(struct rcu_head *head)
 {
+	struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
+
 	kmem_cache_free(fn_hash_kmem, f);
 }
 
+static inline void fn_free_node(struct fib_node *f)
+{
+	call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
+}
+
+static void fn_free_alias_rcu(struct rcu_head *head)
+{
+	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+
+	kmem_cache_free(fn_alias_kmem, fa);
+}
+
 static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
 {
 	fib_release_info(fa->fa_info);
 	if (fa == &f->fn_embedded_alias)
 		fa->fa_info = NULL;
 	else
-		kmem_cache_free(fn_alias_kmem, fa);
+		call_rcu(&fa->rcu, fn_free_alias_rcu);
 }
 
 static struct fn_zone *
@@ -210,68 +234,71 @@ fn_new_zone(struct fn_hash *table, int z)
 	if (!fz)
 		return NULL;
 
-	if (z) {
-		fz->fz_divisor = 16;
-	} else {
-		fz->fz_divisor = 1;
-	}
-	fz->fz_hashmask = (fz->fz_divisor - 1);
-	fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
-	if (!fz->fz_hash) {
-		kfree(fz);
-		return NULL;
-	}
+	seqlock_init(&fz->fz_lock);
+	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
+	fz->fz_hashmask = fz->fz_divisor - 1;
+	RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
 	fz->fz_order = z;
+	fz->fz_revorder = 32 - z;
 	fz->fz_mask = inet_make_mask(z);
 
 	/* Find the first not empty zone with more specific mask */
-	for (i=z+1; i<=32; i++)
+	for (i = z + 1; i <= 32; i++)
 		if (table->fn_zones[i])
 			break;
-	write_lock_bh(&fib_hash_lock);
-	if (i>32) {
+	if (i > 32) {
 		/* No more specific masks, we are the first. */
-		fz->fz_next = table->fn_zone_list;
-		table->fn_zone_list = fz;
+		rcu_assign_pointer(fz->fz_next,
+				   rtnl_dereference(table->fn_zone_list));
+		rcu_assign_pointer(table->fn_zone_list, fz);
 	} else {
-		fz->fz_next = table->fn_zones[i]->fz_next;
-		table->fn_zones[i]->fz_next = fz;
+		rcu_assign_pointer(fz->fz_next,
+				   rtnl_dereference(table->fn_zones[i]->fz_next));
+		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
 	}
 	table->fn_zones[z] = fz;
 	fib_hash_genid++;
-	write_unlock_bh(&fib_hash_lock);
 	return fz;
 }
 
 int fib_table_lookup(struct fib_table *tb,
-		     const struct flowi *flp, struct fib_result *res)
+		     const struct flowi *flp, struct fib_result *res,
+		     int fib_flags)
 {
 	int err;
 	struct fn_zone *fz;
 	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 
-	read_lock(&fib_hash_lock);
-	for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+	rcu_read_lock();
+	for (fz = rcu_dereference(t->fn_zone_list);
+	     fz != NULL;
+	     fz = rcu_dereference(fz->fz_next)) {
 		struct hlist_head *head;
 		struct hlist_node *node;
 		struct fib_node *f;
-		__be32 k = fz_key(flp->fl4_dst, fz);
+		__be32 k;
+		unsigned int seq;
 
-		head = &fz->fz_hash[fn_hash(k, fz)];
-		hlist_for_each_entry(f, node, head, fn_hash) {
-			if (f->fn_key != k)
-				continue;
+		do {
+			seq = read_seqbegin(&fz->fz_lock);
+			k = fz_key(flp->fl4_dst, fz);
+
+			head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
+			hlist_for_each_entry_rcu(f, node, head, fn_hash) {
+				if (f->fn_key != k)
+					continue;
 
-			err = fib_semantic_match(&f->fn_alias,
+				err = fib_semantic_match(&f->fn_alias,
 						 flp, res,
-						 fz->fz_order);
-			if (err <= 0)
-				goto out;
-		}
+						 fz->fz_order, fib_flags);
+				if (err <= 0)
+					goto out;
+			}
+		} while (read_seqretry(&fz->fz_lock, seq));
 	}
 	err = 1;
 out:
-	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -285,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb,
 	struct fib_info *last_resort;
 	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 	struct fn_zone *fz = t->fn_zones[0];
+	struct hlist_head *head;
 
 	if (fz == NULL)
 		return;
@@ -293,11 +321,12 @@ void fib_table_select_default(struct fib_table *tb,
 	last_resort = NULL;
 	order = -1;
 
-	read_lock(&fib_hash_lock);
-	hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+	rcu_read_lock();
+	head = rcu_dereference(fz->fz_hash);
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		struct fib_alias *fa;
 
-		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 			struct fib_info *next_fi = fa->fa_info;
 
 			if (fa->fa_scope != res->scope ||
@@ -309,7 +338,8 @@ void fib_table_select_default(struct fib_table *tb,
 			if (!next_fi->fib_nh[0].nh_gw ||
 			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 				continue;
-			fa->fa_state |= FA_S_ACCESSED;
+
+			fib_alias_accessed(fa);
 
 			if (fi == NULL) {
 				if (next_fi != res->fi)
@@ -341,25 +371,25 @@ void fib_table_select_default(struct fib_table *tb,
 		fib_result_assign(res, last_resort);
 	tb->tb_default = last_idx;
 out:
-	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 }
 
 /* Insert node F to FZ. */
 static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
 {
-	struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
 
-	hlist_add_head(&f->fn_hash, head);
+	hlist_add_head_rcu(&f->fn_hash, head);
 }
 
 /* Return the node in FZ matching KEY. */
 static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 {
-	struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
 	struct hlist_node *node;
 	struct fib_node *f;
 
-	hlist_for_each_entry(f, node, head, fn_hash) {
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		if (f->fn_key == key)
 			return f;
 	}
@@ -367,6 +397,17 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	return NULL;
 }
 
+
+static struct fib_alias *fib_fast_alloc(struct fib_node *f)
+{
+	struct fib_alias *fa = &f->fn_embedded_alias;
+
+	if (fa->fa_info != NULL)
+		fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+	return fa;
+}
+
+/* Caller must hold RTNL. */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -451,7 +492,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		}
 
 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
-			struct fib_info *fi_drop;
 			u8 state;
 
 			fa = fa_first;
@@ -460,21 +500,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 					err = 0;
 				goto out;
 			}
-			write_lock_bh(&fib_hash_lock);
-			fi_drop = fa->fa_info;
-			fa->fa_info = fi;
-			fa->fa_type = cfg->fc_type;
-			fa->fa_scope = cfg->fc_scope;
+			err = -ENOBUFS;
+			new_fa = fib_fast_alloc(f);
+			if (new_fa == NULL)
+				goto out;
+
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = cfg->fc_type;
+			new_fa->fa_scope = cfg->fc_scope;
 			state = fa->fa_state;
-			fa->fa_state &= ~FA_S_ACCESSED;
+			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			fib_hash_genid++;
-			write_unlock_bh(&fib_hash_lock);
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
-			fib_release_info(fi_drop);
+			fn_free_alias(fa, f);
 			if (state & FA_S_ACCESSED)
 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-			rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
-				  &cfg->fc_nlinfo, NLM_F_REPLACE);
+			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
+				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 			return 0;
 		}
 
@@ -506,12 +550,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		f = new_f;
 	}
 
-	new_fa = &f->fn_embedded_alias;
-	if (new_fa->fa_info != NULL) {
-		new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
-		if (new_fa == NULL)
-			goto out;
-	}
+	new_fa = fib_fast_alloc(f);
+	if (new_fa == NULL)
+		goto out;
+
 	new_fa->fa_info = fi;
 	new_fa->fa_tos = tos;
 	new_fa->fa_type = cfg->fc_type;
@@ -522,13 +564,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	 * Insert new entry to the list.
 	 */
 
-	write_lock_bh(&fib_hash_lock);
 	if (new_f)
 		fib_insert_node(fz, new_f);
-	list_add_tail(&new_fa->fa_list,
+	list_add_tail_rcu(&new_fa->fa_list,
 		 (fa ? &fa->fa_list : &f->fn_alias));
 	fib_hash_genid++;
-	write_unlock_bh(&fib_hash_lock);
 
 	if (new_f)
 		fz->fz_nent++;
@@ -603,14 +643,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 			  tb->tb_id, &cfg->fc_nlinfo, 0);
 
 		kill_fn = 0;
-		write_lock_bh(&fib_hash_lock);
-		list_del(&fa->fa_list);
+		list_del_rcu(&fa->fa_list);
 		if (list_empty(&f->fn_alias)) {
-			hlist_del(&f->fn_hash);
+			hlist_del_rcu(&f->fn_hash);
 			kill_fn = 1;
 		}
 		fib_hash_genid++;
-		write_unlock_bh(&fib_hash_lock);
 
 		if (fa->fa_state & FA_S_ACCESSED)
 			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -627,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 
 static int fn_flush_list(struct fn_zone *fz, int idx)
 {
-	struct hlist_head *head = &fz->fz_hash[idx];
+	struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
 	struct hlist_node *node, *n;
 	struct fib_node *f;
 	int found = 0;
@@ -641,14 +679,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 			struct fib_info *fi = fa->fa_info;
 
 			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-				write_lock_bh(&fib_hash_lock);
-				list_del(&fa->fa_list);
+				list_del_rcu(&fa->fa_list);
 				if (list_empty(&f->fn_alias)) {
-					hlist_del(&f->fn_hash);
+					hlist_del_rcu(&f->fn_hash);
 					kill_f = 1;
 				}
 				fib_hash_genid++;
-				write_unlock_bh(&fib_hash_lock);
 
 				fn_free_alias(fa, f);
 				found++;
@@ -662,13 +698,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 	return found;
 }
 
+/* caller must hold RTNL. */
 int fib_table_flush(struct fib_table *tb)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fn_zone *fz;
 	int found = 0;
 
-	for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+	for (fz = rtnl_dereference(table->fn_zone_list);
+	     fz != NULL;
+	     fz = rtnl_dereference(fz->fz_next)) {
 		int i;
 
 		for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -677,6 +716,24 @@ int fib_table_flush(struct fib_table *tb)
 	return found;
 }
 
+void fib_free_table(struct fib_table *tb)
+{
+	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+	struct fn_zone *fz, *next;
+
+	next = table->fn_zone_list;
+	while (next != NULL) {
+		fz = next;
+		next = fz->fz_next;
+
+		if (fz->fz_hash != fz->fz_embedded_hash)
+			fz_hash_free(fz->fz_hash, fz->fz_divisor);
+
+		kfree(fz);
+	}
+
+	kfree(tb);
+}
 
 static inline int
 fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
@@ -690,10 +747,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 
 	s_i = cb->args[4];
 	i = 0;
-	hlist_for_each_entry(f, node, head, fn_hash) {
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		struct fib_alias *fa;
 
-		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 			if (i < s_i)
 				goto next;
 
@@ -711,7 +768,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 				cb->args[4] = i;
 				return -1;
 			}
-		next:
+next:
 			i++;
 		}
 	}
@@ -725,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 		   struct fn_zone *fz)
 {
 	int h, s_h;
+	struct hlist_head *head = rcu_dereference(fz->fz_hash);
 
-	if (fz->fz_hash == NULL)
+	if (head == NULL)
 		return skb->len;
 	s_h = cb->args[3];
 	for (h = s_h; h < fz->fz_divisor; h++) {
-		if (hlist_empty(&fz->fz_hash[h]))
+		if (hlist_empty(head + h))
 			continue;
-		if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) {
+		if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
 			cb->args[3] = h;
 			return -1;
 		}
@@ -746,23 +804,26 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 		   struct netlink_callback *cb)
 {
-	int m, s_m;
+	int m = 0, s_m;
 	struct fn_zone *fz;
 	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
 
 	s_m = cb->args[2];
-	read_lock(&fib_hash_lock);
-	for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
-		if (m < s_m) continue;
+	rcu_read_lock();
+	for (fz = rcu_dereference(table->fn_zone_list);
+	     fz != NULL;
+	     fz = rcu_dereference(fz->fz_next), m++) {
+		if (m < s_m)
+			continue;
 		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
 			cb->args[2] = m;
-			read_unlock(&fib_hash_lock);
+			rcu_read_unlock();
 			return -1;
 		}
 		memset(&cb->args[3], 0,
 		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
-	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 	cb->args[2] = m;
 	return skb->len;
 }
@@ -825,14 +886,15 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
 	iter->genid	= fib_hash_genid;
 	iter->valid	= 1;
 
-	for (iter->zone = table->fn_zone_list; iter->zone;
-	     iter->zone = iter->zone->fz_next) {
+	for (iter->zone = rcu_dereference(table->fn_zone_list);
+	     iter->zone != NULL;
+	     iter->zone = rcu_dereference(iter->zone->fz_next)) {
 		int maxslot;
 
 		if (!iter->zone->fz_nent)
 			continue;
 
-		iter->hash_head = iter->zone->fz_hash;
+		iter->hash_head = rcu_dereference(iter->zone->fz_hash);
 		maxslot = iter->zone->fz_divisor;
 
 		for (iter->bucket = 0; iter->bucket < maxslot;
@@ -911,13 +973,13 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
 			}
 		}
 
-		iter->zone = iter->zone->fz_next;
+		iter->zone = rcu_dereference(iter->zone->fz_next);
 
 		if (!iter->zone)
 			goto out;
 
 		iter->bucket = 0;
-		iter->hash_head = iter->zone->fz_hash;
+		iter->hash_head = rcu_dereference(iter->zone->fz_hash);
 
 		hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
 			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
@@ -950,11 +1012,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(fib_hash_lock)
+	__acquires(RCU)
 {
 	void *v = NULL;
 
-	read_lock(&fib_hash_lock);
+	rcu_read_lock();
 	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
 		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 	return v;
@@ -967,15 +1029,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void fib_seq_stop(struct seq_file *seq, void *v)
-	__releases(fib_hash_lock)
+	__releases(RCU)
 {
-	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 }
 
 static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
 {
 	static const unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT, [8] = RTF_REJECT,
+		[7] = RTF_REJECT,
+		[8] = RTF_REJECT,
 	};
 	unsigned flags = type2flags[type];
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973b..a29edf2219c 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,17 +12,22 @@ struct fib_alias {
 	u8			fa_type;
 	u8			fa_scope;
 	u8			fa_state;
-#ifdef CONFIG_IP_FIB_TRIE
 	struct rcu_head		rcu;
-#endif
 };
 
 #define FA_S_ACCESSED	0x01
 
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+	if (!(fa->fa_state & FA_S_ACCESSED))
+		fa->fa_state |= FA_S_ACCESSED;
+}
+
 /* Exported by fib_semantics.c */
 extern int fib_semantic_match(struct list_head *head,
 			      const struct flowi *flp,
-			      struct fib_result *res, int prefixlen);
+			      struct fib_result *res, int prefixlen, int fib_flags);
 extern void fib_release_info(struct fib_info *);
 extern struct fib_info *fib_create_info(struct fib_config *cfg);
 extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff56..7981a24f5c7 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
  *		IPv4 Forwarding Information Base: policy rules.
  *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * 		Thomas Graf <tgraf@suug.ch>
+ *		Thomas Graf <tgraf@suug.ch>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  * Fixes:
- * 		Rani Assaf	:	local_rule cannot be deleted
+ *		Rani Assaf	:	local_rule cannot be deleted
  *		Marc Boucher	:	routing by fwmark
  */
 
@@ -32,8 +32,7 @@
 #include <net/ip_fib.h>
 #include <net/fib_rules.h>
 
-struct fib4_rule
-{
+struct fib4_rule {
 	struct fib_rule		common;
 	u8			dst_len;
 	u8			src_len;
@@ -58,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
 {
 	struct fib_lookup_arg arg = {
 		.result = res,
+		.flags = FIB_LOOKUP_NOREF,
 	};
 	int err;
 
@@ -91,10 +91,11 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 		goto errout;
 	}
 
-	if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
+	tbl = fib_get_table(rule->fr_net, rule->table);
+	if (!tbl)
 		goto errout;
 
-	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
+	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
 	if (err > 0)
 		err = -EAGAIN;
 errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e..3e0da3ef611 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 static DEFINE_SPINLOCK(fib_multipath_lock);
 
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
-for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh;				\
+	for (nhsel = 0, nh = (fi)->fib_nh;				\
+	     nhsel < (fi)->fib_nhs;					\
+	     nh++, nhsel++)
+
+#define change_nexthops(fi) {						\
+	int nhsel; struct fib_nh *nexthop_nh;				\
+	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	     nhsel < (fi)->fib_nhs;					\
+	     nexthop_nh++, nhsel++)
 
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
 /* Hope, that gcc will optimize it to get rid of dummy loop */
 
-#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
+	for (nhsel = 0; nhsel < 1; nhsel++)
 
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) {						\
+	int nhsel;							\
+	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	for (nhsel = 0; nhsel < 1; nhsel++)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
@@ -86,63 +95,70 @@ static const struct
 	int	error;
 	u8	scope;
 } fib_props[RTN_MAX + 1] = {
-	{
+	[RTN_UNSPEC] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_UNSPEC */
-	{
+	},
+	[RTN_UNICAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_UNICAST */
-	{
+	},
+	[RTN_LOCAL] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_HOST,
-	},	/* RTN_LOCAL */
-	{
+	},
+	[RTN_BROADCAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_LINK,
-	},	/* RTN_BROADCAST */
-	{
+	},
+	[RTN_ANYCAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_LINK,
-	},	/* RTN_ANYCAST */
-	{
+	},
+	[RTN_MULTICAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_MULTICAST */
-	{
+	},
+	[RTN_BLACKHOLE] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_BLACKHOLE */
-	{
+	},
+	[RTN_UNREACHABLE] = {
 		.error	= -EHOSTUNREACH,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_UNREACHABLE */
-	{
+	},
+	[RTN_PROHIBIT] = {
 		.error	= -EACCES,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_PROHIBIT */
-	{
+	},
+	[RTN_THROW] = {
 		.error	= -EAGAIN,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_THROW */
-	{
+	},
+	[RTN_NAT] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_NAT */
-	{
+	},
+	[RTN_XRESOLVE] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_XRESOLVE */
+	},
 };
 
 
 /* Release a nexthop info record */
 
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+	struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+	kfree(fi);
+}
+
 void free_fib_info(struct fib_info *fi)
 {
 	if (fi->fib_dead == 0) {
-		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
+		pr_warning("Freeing alive fib_info %p\n", fi);
 		return;
 	}
 	change_nexthops(fi) {
@@ -152,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
 	} endfor_nexthops(fi);
 	fib_info_cnt--;
 	release_net(fi->fib_net);
-	kfree(fi);
+	call_rcu(&fi->rcu, free_fib_info_rcu);
 }
 
 void fib_release_info(struct fib_info *fi)
@@ -173,7 +189,7 @@ void fib_release_info(struct fib_info *fi)
 	spin_unlock_bh(&fib_info_lock);
 }
 
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 {
 	const struct fib_nh *onh = ofi->fib_nh;
 
@@ -187,7 +203,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
 #ifdef CONFIG_NET_CLS_ROUTE
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -238,7 +254,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 		    nfi->fib_priority == fi->fib_priority &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(fi->fib_metrics)) == 0 &&
-		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 			return fi;
 	}
@@ -247,9 +263,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 }
 
 /* Check, that the gateway is already configured.
-   Used only by redirect accept routine.
+ * Used only by redirect accept routine.
  */
-
 int ip_fib_check_default(__be32 gw, struct net_device *dev)
 {
 	struct hlist_head *head;
@@ -264,7 +279,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
 	hlist_for_each_entry(nh, node, head, nh_hash) {
 		if (nh->nh_dev == dev &&
 		    nh->nh_gw == gw &&
-		    !(nh->nh_flags&RTNH_F_DEAD)) {
+		    !(nh->nh_flags & RTNH_F_DEAD)) {
 			spin_unlock(&fib_info_lock);
 			return 0;
 		}
@@ -362,10 +377,10 @@ int fib_detect_death(struct fib_info *fi, int order,
 	}
 	if (state == NUD_REACHABLE)
 		return 0;
-	if ((state&NUD_VALID) && order != dflt)
+	if ((state & NUD_VALID) && order != dflt)
 		return 0;
-	if ((state&NUD_VALID) ||
-	    (*last_idx<0 && order > dflt)) {
+	if ((state & NUD_VALID) ||
+	    (*last_idx < 0 && order > dflt)) {
 		*last_resort = fi;
 		*last_idx = order;
 	}
@@ -476,75 +491,76 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 
 
 /*
-   Picture
-   -------
-
-   Semantics of nexthop is very messy by historical reasons.
-   We have to take into account, that:
-   a) gateway can be actually local interface address,
-      so that gatewayed route is direct.
-   b) gateway must be on-link address, possibly
-      described not by an ifaddr, but also by a direct route.
-   c) If both gateway and interface are specified, they should not
-      contradict.
-   d) If we use tunnel routes, gateway could be not on-link.
-
-   Attempt to reconcile all of these (alas, self-contradictory) conditions
-   results in pretty ugly and hairy code with obscure logic.
-
-   I chose to generalized it instead, so that the size
-   of code does not increase practically, but it becomes
-   much more general.
-   Every prefix is assigned a "scope" value: "host" is local address,
-   "link" is direct route,
-   [ ... "site" ... "interior" ... ]
-   and "universe" is true gateway route with global meaning.
-
-   Every prefix refers to a set of "nexthop"s (gw, oif),
-   where gw must have narrower scope. This recursion stops
-   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
-   which means that gw is forced to be on link.
-
-   Code is still hairy, but now it is apparently logically
-   consistent and very flexible. F.e. as by-product it allows
-   to co-exists in peace independent exterior and interior
-   routing processes.
-
-   Normally it looks as following.
-
-   {universe prefix}  -> (gw, oif) [scope link]
-			  |
-			  |-> {link prefix} -> (gw, oif) [scope local]
-						|
-						|-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ *    so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ *    described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ *    contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix}  -> (gw, oif) [scope link]
+ *		  |
+ *		  |-> {link prefix} -> (gw, oif) [scope local]
+ *					|
+ *					|-> {local prefix} (terminal node)
  */
-
 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			struct fib_nh *nh)
 {
 	int err;
 	struct net *net;
+	struct net_device *dev;
 
 	net = cfg->fc_nlinfo.nl_net;
 	if (nh->nh_gw) {
 		struct fib_result res;
 
-		if (nh->nh_flags&RTNH_F_ONLINK) {
-			struct net_device *dev;
+		if (nh->nh_flags & RTNH_F_ONLINK) {
 
 			if (cfg->fc_scope >= RT_SCOPE_LINK)
 				return -EINVAL;
 			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
 				return -EINVAL;
-			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
+			dev = __dev_get_by_index(net, nh->nh_oif);
+			if (!dev)
 				return -ENODEV;
-			if (!(dev->flags&IFF_UP))
+			if (!(dev->flags & IFF_UP))
 				return -ENETDOWN;
 			nh->nh_dev = dev;
 			dev_hold(dev);
 			nh->nh_scope = RT_SCOPE_LINK;
 			return 0;
 		}
+		rcu_read_lock();
 		{
 			struct flowi fl = {
 				.nl_u = {
@@ -559,50 +575,53 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			/* It is not necessary, but requires a bit of thinking */
 			if (fl.fl4_scope < RT_SCOPE_LINK)
 				fl.fl4_scope = RT_SCOPE_LINK;
-			if ((err = fib_lookup(net, &fl, &res)) != 0)
+			err = fib_lookup(net, &fl, &res);
+			if (err) {
+				rcu_read_unlock();
 				return err;
+			}
 		}
 		err = -EINVAL;
 		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
 			goto out;
 		nh->nh_scope = res.scope;
 		nh->nh_oif = FIB_RES_OIF(res);
-		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+		nh->nh_dev = dev = FIB_RES_DEV(res);
+		if (!dev)
 			goto out;
-		dev_hold(nh->nh_dev);
-		err = -ENETDOWN;
-		if (!(nh->nh_dev->flags & IFF_UP))
-			goto out;
-		err = 0;
-out:
-		fib_res_put(&res);
-		return err;
+		dev_hold(dev);
+		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
 	} else {
 		struct in_device *in_dev;
 
-		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
 			return -EINVAL;
 
+		rcu_read_lock();
+		err = -ENODEV;
 		in_dev = inetdev_by_index(net, nh->nh_oif);
 		if (in_dev == NULL)
-			return -ENODEV;
-		if (!(in_dev->dev->flags&IFF_UP)) {
-			in_dev_put(in_dev);
-			return -ENETDOWN;
-		}
+			goto out;
+		err = -ENETDOWN;
+		if (!(in_dev->dev->flags & IFF_UP))
+			goto out;
 		nh->nh_dev = in_dev->dev;
 		dev_hold(nh->nh_dev);
 		nh->nh_scope = RT_SCOPE_HOST;
-		in_dev_put(in_dev);
+		err = 0;
 	}
-	return 0;
+out:
+	rcu_read_unlock();
+	return err;
 }
 
 static inline unsigned int fib_laddr_hashfn(__be32 val)
 {
 	unsigned int mask = (fib_hash_size - 1);
 
-	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+	return ((__force u32)val ^
+		((__force u32)val >> 7) ^
+		((__force u32)val >> 14)) & mask;
 }
 
 static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +630,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
 		return kzalloc(bytes, GFP_KERNEL);
 	else
 		return (struct hlist_head *)
-			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					 get_order(bytes));
 }
 
 static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +826,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			goto failure;
 	} else {
 		change_nexthops(fi) {
-			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
+			err = fib_check_nh(cfg, fi, nexthop_nh);
+			if (err != 0)
 				goto failure;
 		} endfor_nexthops(fi)
 	}
@@ -819,7 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	}
 
 link_it:
-	if ((ofi = fib_find_info(fi)) != NULL) {
+	ofi = fib_find_info(fi);
+	if (ofi) {
 		fi->fib_dead = 1;
 		free_fib_info(fi);
 		ofi->fib_treeref++;
@@ -864,7 +886,7 @@ failure:
 
 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
-		       struct fib_result *res, int prefixlen)
+		       struct fib_result *res, int prefixlen, int fib_flags)
 {
 	struct fib_alias *fa;
 	int nh_sel = 0;
@@ -879,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 		if (fa->fa_scope < flp->fl4_scope)
 			continue;
 
-		fa->fa_state |= FA_S_ACCESSED;
+		fib_alias_accessed(fa);
 
 		err = fib_props[fa->fa_type].error;
 		if (err == 0) {
@@ -895,7 +917,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 			case RTN_ANYCAST:
 			case RTN_MULTICAST:
 				for_nexthops(fi) {
-					if (nh->nh_flags&RTNH_F_DEAD)
+					if (nh->nh_flags & RTNH_F_DEAD)
 						continue;
 					if (!flp->oif || flp->oif == nh->nh_oif)
 						break;
@@ -906,16 +928,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 					goto out_fill_res;
 				}
 #else
-				if (nhsel < 1) {
+				if (nhsel < 1)
 					goto out_fill_res;
-				}
 #endif
 				endfor_nexthops(fi);
 				continue;
 
 			default:
-				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
-					fa->fa_type);
+				pr_warning("fib_semantic_match bad type %#x\n",
+					   fa->fa_type);
 				return -EINVAL;
 			}
 		}
@@ -929,7 +950,8 @@ out_fill_res:
 	res->type = fa->fa_type;
 	res->scope = fa->fa_scope;
 	res->fi = fa->fa_info;
-	atomic_inc(&res->fi->fib_clntref);
+	if (!(fib_flags & FIB_LOOKUP_NOREF))
+		atomic_inc(&res->fi->fib_clntref);
 	return 0;
 }
 
@@ -1028,10 +1050,10 @@ nla_put_failure:
 }
 
 /*
-   Update FIB if:
-   - local address disappeared -> we must delete all the entries
-     referring to it.
-   - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ *   referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
  */
 int fib_sync_down_addr(struct net *net, __be32 local)
 {
@@ -1078,7 +1100,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 		prev_fi = fi;
 		dead = 0;
 		change_nexthops(fi) {
-			if (nexthop_nh->nh_flags&RTNH_F_DEAD)
+			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
 				dead++;
 			else if (nexthop_nh->nh_dev == dev &&
 				 nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1132,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 /*
-   Dead device goes up. We wake up dead nexthops.
-   It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
  */
-
 int fib_sync_up(struct net_device *dev)
 {
 	struct fib_info *prev_fi;
@@ -1123,7 +1144,7 @@ int fib_sync_up(struct net_device *dev)
 	struct fib_nh *nh;
 	int ret;
 
-	if (!(dev->flags&IFF_UP))
+	if (!(dev->flags & IFF_UP))
 		return 0;
 
 	prev_fi = NULL;
@@ -1142,12 +1163,12 @@ int fib_sync_up(struct net_device *dev)
 		prev_fi = fi;
 		alive = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
 				alive++;
 				continue;
 			}
 			if (nexthop_nh->nh_dev == NULL ||
-			    !(nexthop_nh->nh_dev->flags&IFF_UP))
+			    !(nexthop_nh->nh_dev->flags & IFF_UP))
 				continue;
 			if (nexthop_nh->nh_dev != dev ||
 			    !__in_dev_get_rtnl(dev))
@@ -1169,10 +1190,9 @@ int fib_sync_up(struct net_device *dev)
 }
 
 /*
-   The algorithm is suboptimal, but it provides really
-   fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
  */
-
 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 {
 	struct fib_info *fi = res->fi;
@@ -1182,7 +1202,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 	if (fi->fib_power <= 0) {
 		int power = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
 				power += nexthop_nh->nh_weight;
 				nexthop_nh->nh_power = nexthop_nh->nh_weight;
 			}
@@ -1198,15 +1218,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 
 
 	/* w should be random number [0..fi->fib_power-1],
-	   it is pretty bad approximation.
+	 * it is pretty bad approximation.
 	 */
 
 	w = jiffies % fi->fib_power;
 
 	change_nexthops(fi) {
-		if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
 		    nexthop_nh->nh_power) {
-			if ((w -= nexthop_nh->nh_power) <= 0) {
+			w -= nexthop_nh->nh_power;
+			if (w <= 0) {
 				nexthop_nh->nh_power--;
 				fi->fib_power--;
 				res->nh_sel = nhsel;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 79d057a939b..200eb538fbb 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -16,7 +16,7 @@
  *
  * An experimental study of compression methods for dynamic tries
  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ * http://www.csc.kth.se/~snilsson/software/dyntrie2/
  *
  *
  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
@@ -186,7 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
 {
 	struct tnode *ret = node_parent(node);
 
-	return rcu_dereference(ret);
+	return rcu_dereference_rtnl(ret);
 }
 
 /* Same as rcu_assign_pointer
@@ -209,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 {
 	struct node *ret = tnode_get_child(tn, i);
 
-	return rcu_dereference_check(ret,
-				     rcu_read_lock_held() ||
-				     lockdep_rtnl_is_held());
+	return rcu_dereference_rtnl(ret);
 }
 
 static inline int tnode_child_length(const struct tnode *tn)
@@ -457,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 		tn->empty_children = 1<<bits;
 	}
 
-	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
-		 (unsigned long) (sizeof(struct node) << bits));
+	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+		 sizeof(struct node) << bits);
 	return tn;
 }
 
@@ -607,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 	/* Keep root node larger  */
 
-	if (!node_parent((struct node*) tn)) {
+	if (!node_parent((struct node *)tn)) {
 		inflate_threshold_use = inflate_threshold_root;
 		halve_threshold_use = halve_threshold_root;
-	}
-	else {
+	} else {
 		inflate_threshold_use = inflate_threshold;
 		halve_threshold_use = halve_threshold;
 	}
@@ -637,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	check_tnode(tn);
 
 	/* Return if at least one inflate is run */
-	if( max_work != MAX_WORK)
+	if (max_work != MAX_WORK)
 		return (struct node *) tn;
 
 	/*
@@ -964,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
 	struct node *n;
 
 	pos = 0;
-	n = rcu_dereference_check(t->trie,
-				  rcu_read_lock_held() ||
-				  lockdep_rtnl_is_held());
+	n = rcu_dereference_rtnl(t->trie);
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
@@ -1347,7 +1342,7 @@ err:
 /* should be called with rcu_read_lock */
 static int check_leaf(struct trie *t, struct leaf *l,
 		      t_key key,  const struct flowi *flp,
-		      struct fib_result *res)
+		      struct fib_result *res, int fib_flags)
 {
 	struct leaf_info *li;
 	struct hlist_head *hhead = &l->list;
@@ -1361,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
 		if (l->key != (key & ntohl(mask)))
 			continue;
 
-		err = fib_semantic_match(&li->falh, flp, res, plen);
+		err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 		if (err <= 0)
@@ -1377,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
 }
 
 int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
-		     struct fib_result *res)
+		     struct fib_result *res, int fib_flags)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	int ret;
@@ -1389,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 	t_key cindex = 0;
 	int current_prefix_length = KEYLENGTH;
 	struct tnode *cn;
-	t_key node_prefix, key_prefix, pref_mismatch;
-	int mp;
+	t_key pref_mismatch;
 
 	rcu_read_lock();
 
@@ -1404,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 
 	/* Just a leaf? */
 	if (IS_LEAF(n)) {
-		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+		ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
 		goto found;
 	}
 
@@ -1429,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		}
 
 		if (IS_LEAF(n)) {
-			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+			ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
 			if (ret > 0)
 				goto backtrace;
 			goto found;
@@ -1505,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		 * matching prefix.
 		 */
 
-		node_prefix = mask_pfx(cn->key, cn->pos);
-		key_prefix = mask_pfx(key, cn->pos);
-		pref_mismatch = key_prefix^node_prefix;
-		mp = 0;
+		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
 
 		/*
 		 * In short: If skipped bits in this node do not match
@@ -1516,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		 * state.directly.
 		 */
 		if (pref_mismatch) {
-			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
-				mp++;
-				pref_mismatch = pref_mismatch << 1;
-			}
-			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+			int mp = KEYLENGTH - fls(pref_mismatch);
 
-			if (key_prefix != 0)
+			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
 				goto backtrace;
 
 			if (current_prefix_length >= cn->pos)
@@ -1746,14 +1733,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
 
 		/* Node empty, walk back up to parent */
 		c = (struct node *) p;
-	} while ( (p = node_parent_rcu(c)) != NULL);
+	} while ((p = node_parent_rcu(c)) != NULL);
 
 	return NULL; /* Root of trie */
 }
 
 static struct leaf *trie_firstleaf(struct trie *t)
 {
-	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
+	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
 
 	if (!n)
 		return NULL;
@@ -1810,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb)
 	return found;
 }
 
+void fib_free_table(struct fib_table *tb)
+{
+	kfree(tb);
+}
+
 void fib_table_select_default(struct fib_table *tb,
 			      const struct flowi *flp,
 			      struct fib_result *res)
@@ -1851,7 +1843,8 @@ void fib_table_select_default(struct fib_table *tb,
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
-		fa->fa_state |= FA_S_ACCESSED;
+
+		fib_alias_accessed(fa);
 
 		if (fi == NULL) {
 			if (next_fi != res->fi)
@@ -2039,14 +2032,14 @@ struct fib_trie_iter {
 	struct seq_net_private p;
 	struct fib_table *tb;
 	struct tnode *tnode;
-	unsigned index;
-	unsigned depth;
+	unsigned int index;
+	unsigned int depth;
 };
 
 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
 {
 	struct tnode *tn = iter->tnode;
-	unsigned cindex = iter->index;
+	unsigned int cindex = iter->index;
 	struct tnode *p;
 
 	/* A single entry routing table */
@@ -2155,7 +2148,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
  */
 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 {
-	unsigned i, max, pointers, bytes, avdepth;
+	unsigned int i, max, pointers, bytes, avdepth;
 
 	if (stat->leaves)
 		avdepth = stat->totdepth*100 / stat->leaves;
@@ -2352,7 +2345,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
 
 static void seq_indent(struct seq_file *seq, int n)
 {
-	while (n-- > 0) seq_puts(seq, "   ");
+	while (n-- > 0)
+		seq_puts(seq, "   ");
 }
 
 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2384,7 +2378,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
 	[RTN_XRESOLVE] = "XRESOLVE",
 };
 
-static inline const char *rtn_type(char *buf, size_t len, unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
 {
 	if (t < __RTN_MAX && rtn_type_names[t])
 		return rtn_type_names[t];
@@ -2540,13 +2534,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock();
 }
 
-static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
 {
-	static unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT, [8] = RTF_REJECT,
-	};
-	unsigned flags = type2flags[type];
+	unsigned int flags = 0;
 
+	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+		flags = RTF_REJECT;
 	if (fi && fi->fib_nh->nh_gw)
 		flags |= RTF_GATEWAY;
 	if (mask == htonl(0xFFFFFFFF))
@@ -2558,7 +2551,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
 /*
  *	This outputs /proc/net/route.
  *	The format of the file is not supposed to be changed
- * 	and needs to be same as fib_hash output to avoid breaking
+ *	and needs to be same as fib_hash output to avoid breaking
  *	legacy utilities
  */
 static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2583,7 +2576,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 
 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 			const struct fib_info *fi = fa->fa_info;
-			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
 			int len;
 
 			if (fa->fa_type == RTN_BROADCAST
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 00000000000..c6933f2ea31
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,152 @@
+/*
+ *	GRE over IPv4 demultiplexer driver
+ *
+ *	Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+
+static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
+static DEFINE_SPINLOCK(gre_proto_lock);
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		goto err_out;
+
+	spin_lock(&gre_proto_lock);
+	if (gre_proto[version])
+		goto err_out_unlock;
+
+	rcu_assign_pointer(gre_proto[version], proto);
+	spin_unlock(&gre_proto_lock);
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		goto err_out;
+
+	spin_lock(&gre_proto_lock);
+	if (rcu_dereference_protected(gre_proto[version],
+			lockdep_is_held(&gre_proto_lock)) != proto)
+		goto err_out_unlock;
+	rcu_assign_pointer(gre_proto[version], NULL);
+	spin_unlock(&gre_proto_lock);
+	synchronize_rcu();
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+static int gre_rcv(struct sk_buff *skb)
+{
+	const struct gre_protocol *proto;
+	u8 ver;
+	int ret;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+
+	ver = skb->data[1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (!proto || !proto->handler)
+		goto drop_unlock;
+	ret = proto->handler(skb);
+	rcu_read_unlock();
+	return ret;
+
+drop_unlock:
+	rcu_read_unlock();
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+	const struct gre_protocol *proto;
+	u8 ver;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+
+	ver = skb->data[1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (!proto || !proto->err_handler)
+		goto drop_unlock;
+	proto->err_handler(skb, info);
+	rcu_read_unlock();
+	return;
+
+drop_unlock:
+	rcu_read_unlock();
+drop:
+	kfree_skb(skb);
+}
+
+static const struct net_protocol net_gre_protocol = {
+	.handler     = gre_rcv,
+	.err_handler = gre_err,
+	.netns_ok    = 1,
+};
+
+static int __init gre_init(void)
+{
+	pr_info("GRE over IPv4 demultiplexor driver");
+
+	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+		pr_err("gre: can't add protocol\n");
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void __exit gre_exit(void)
+{
+	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
+
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d65e9215bcd..96bc7f9475a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -181,6 +181,7 @@ const struct icmp_err icmp_err_convert[] = {
 		.fatal = 1,
 	},
 };
+EXPORT_SYMBOL(icmp_err_convert);
 
 /*
  *	ICMP control array. This specifies what to do with each ICMP.
@@ -267,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
 	dst->rate_tokens = token;
 	return rc;
 }
+EXPORT_SYMBOL(xrlim_allow);
 
 static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 		int type, int code)
 {
-	struct dst_entry *dst = &rt->u.dst;
+	struct dst_entry *dst = &rt->dst;
 	int rc = 1;
 
 	if (type > NR_ICMP_TYPES)
@@ -327,7 +329,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
 	struct sock *sk;
 	struct sk_buff *skb;
 
-	sk = icmp_sk(dev_net((*rt)->u.dst.dev));
+	sk = icmp_sk(dev_net((*rt)->dst.dev));
 	if (ip_append_data(sk, icmp_glue_bits, icmp_param,
 			   icmp_param->data_len+icmp_param->head_len,
 			   icmp_param->head_len,
@@ -359,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 {
 	struct ipcm_cookie ipc;
 	struct rtable *rt = skb_rtable(skb);
-	struct net *net = dev_net(rt->u.dst.dev);
+	struct net *net = dev_net(rt->dst.dev);
 	struct sock *sk;
 	struct inet_sock *inet;
 	__be32 daddr;
@@ -377,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 	if (icmp_param->replyopts.optlen) {
 		ipc.opt = &icmp_param->replyopts;
 		if (ipc.opt->srr)
@@ -427,7 +429,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 
 	if (!rt)
 		goto out;
-	net = dev_net(rt->u.dst.dev);
+	net = dev_net(rt->dst.dev);
 
 	/*
 	 *	Find the original header. It is expected to be valid, of course.
@@ -536,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	inet_sk(sk)->tos = tos;
 	ipc.addr = iph->saddr;
 	ipc.opt = &icmp_param.replyopts;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 
 	{
 		struct flowi fl = {
@@ -596,9 +598,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 			/* Ugh! */
 			orefdst = skb_in->_skb_refdst; /* save old refdst */
 			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
-					     RT_TOS(tos), rt2->u.dst.dev);
+					     RT_TOS(tos), rt2->dst.dev);
 
-			dst_release(&rt2->u.dst);
+			dst_release(&rt2->dst);
 			rt2 = skb_rtable(skb_in);
 			skb_in->_skb_refdst = orefdst; /* restore old refdst */
 		}
@@ -610,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 				  XFRM_LOOKUP_ICMP);
 		switch (err) {
 		case 0:
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 			rt = rt2;
 			break;
 		case -EPERM:
@@ -629,7 +631,7 @@ route_done:
 
 	/* RFC says return as much as we can without exceeding 576 bytes. */
 
-	room = dst_mtu(&rt->u.dst);
+	room = dst_mtu(&rt->dst);
 	if (room > 576)
 		room = 576;
 	room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
@@ -647,6 +649,7 @@ out_unlock:
 	icmp_xmit_unlock(sk);
 out:;
 }
+EXPORT_SYMBOL(icmp_send);
 
 
 /*
@@ -925,6 +928,7 @@ static void icmp_address(struct sk_buff *skb)
 /*
  * RFC1812 (4.3.3.9).	A router SHOULD listen all replies, and complain
  *			loudly if an inconsistency is found.
+ * called with rcu_read_lock()
  */
 
 static void icmp_address_reply(struct sk_buff *skb)
@@ -935,12 +939,12 @@ static void icmp_address_reply(struct sk_buff *skb)
 	struct in_ifaddr *ifa;
 
 	if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
-		goto out;
+		return;
 
-	in_dev = in_dev_get(dev);
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
-		goto out;
-	rcu_read_lock();
+		return;
+
 	if (in_dev->ifa_list &&
 	    IN_DEV_LOG_MARTIANS(in_dev) &&
 	    IN_DEV_FORWARD(in_dev)) {
@@ -958,9 +962,6 @@ static void icmp_address_reply(struct sk_buff *skb)
 			       mp, dev->name, &rt->rt_src);
 		}
 	}
-	rcu_read_unlock();
-	in_dev_put(in_dev);
-out:;
 }
 
 static void icmp_discard(struct sk_buff *skb)
@@ -974,7 +975,7 @@ int icmp_rcv(struct sk_buff *skb)
 {
 	struct icmphdr *icmph;
 	struct rtable *rt = skb_rtable(skb);
-	struct net *net = dev_net(rt->u.dst.dev);
+	struct net *net = dev_net(rt->dst.dev);
 
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 		struct sec_path *sp = skb_sec_path(skb);
@@ -1216,7 +1217,3 @@ int __init icmp_init(void)
 {
 	return register_pernet_subsys(&icmp_sk_ops);
 }
-
-EXPORT_SYMBOL(icmp_err_convert);
-EXPORT_SYMBOL(icmp_send);
-EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 5fff865a4fa..c8877c6c721 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -312,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 		return NULL;
 	}
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	skb->dev = dev;
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -330,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	pip->saddr    = rt->rt_src;
 	pip->protocol = IPPROTO_IGMP;
 	pip->tot_len  = 0;	/* filled in later */
-	ip_select_ident(pip, &rt->u.dst, NULL);
+	ip_select_ident(pip, &rt->dst, NULL);
 	((u8*)&pip[1])[0] = IPOPT_RA;
 	((u8*)&pip[1])[1] = 4;
 	((u8*)&pip[1])[2] = 0;
@@ -660,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 		return -1;
 	}
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 
@@ -676,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	iph->daddr    = dst;
 	iph->saddr    = rt->rt_src;
 	iph->protocol = IPPROTO_IGMP;
-	ip_select_ident(iph, &rt->u.dst, NULL);
+	ip_select_ident(iph, &rt->dst, NULL);
 	((u8*)&iph[1])[0] = IPOPT_RA;
 	((u8*)&iph[1])[1] = 4;
 	((u8*)&iph[1])[2] = 0;
@@ -856,6 +856,18 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		igmpv3_clear_delrec(in_dev);
 	} else if (len < 12) {
 		return;	/* ignore bogus packet; freed by caller */
+	} else if (IGMP_V1_SEEN(in_dev)) {
+		/* This is a v3 query with v1 queriers present */
+		max_delay = IGMP_Query_Response_Interval;
+		group = 0;
+	} else if (IGMP_V2_SEEN(in_dev)) {
+		/* this is a v3 query with v2 queriers present;
+		 * Interpretation of the max_delay code is problematic here.
+		 * A real v2 host would use ih_code directly, while v3 has a
+		 * different encoding. We use the v3 encoding as more likely
+		 * to be intended in a v3 query.
+		 */
+		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
 			return;
@@ -916,18 +928,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 	read_unlock(&in_dev->mc_list_lock);
 }
 
+/* called in rcu_read_lock() section */
 int igmp_rcv(struct sk_buff *skb)
 {
 	/* This basically follows the spec line by line -- see RFC1112 */
 	struct igmphdr *ih;
-	struct in_device *in_dev = in_dev_get(skb->dev);
+	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 	int len = skb->len;
 
 	if (in_dev == NULL)
 		goto drop;
 
 	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
-		goto drop_ref;
+		goto drop;
 
 	switch (skb->ip_summed) {
 	case CHECKSUM_COMPLETE:
@@ -937,7 +950,7 @@ int igmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_NONE:
 		skb->csum = 0;
 		if (__skb_checksum_complete(skb))
-			goto drop_ref;
+			goto drop;
 	}
 
 	ih = igmp_hdr(skb);
@@ -957,7 +970,6 @@ int igmp_rcv(struct sk_buff *skb)
 		break;
 	case IGMP_PIM:
 #ifdef CONFIG_IP_PIMSM_V1
-		in_dev_put(in_dev);
 		return pim_rcv_v1(skb);
 #endif
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
@@ -971,8 +983,6 @@ int igmp_rcv(struct sk_buff *skb)
 		break;
 	}
 
-drop_ref:
-	in_dev_put(in_dev);
 drop:
 	kfree_skb(skb);
 	return 0;
@@ -1246,6 +1256,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 out:
 	return;
 }
+EXPORT_SYMBOL(ip_mc_inc_group);
 
 /*
  *	Resend IGMP JOIN report; used for bonding.
@@ -1258,16 +1269,17 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
 
-	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
-		igmp_mod_timer(im, IGMP_Initial_Report_Delay);
-		return;
-	}
-	/* else, v3 */
-	im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
-		IGMP_Unsolicited_Report_Count;
-	igmp_ifc_event(in_dev);
+	/* a failover is happening and switches
+	 * must be notified immediately */
+	if (IGMP_V1_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+	else if (IGMP_V2_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+	else
+		igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
 #endif
 }
+EXPORT_SYMBOL(ip_mc_rejoin_group);
 
 /*
  *	A socket has left a multicast group on device dev
@@ -1298,6 +1310,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
 		}
 	}
 }
+EXPORT_SYMBOL(ip_mc_dec_group);
 
 /* Device changing type */
 
@@ -1405,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	write_unlock_bh(&in_dev->mc_list_lock);
 }
 
+/* RTNL is locked */
 static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 {
 	struct flowi fl = { .nl_u = { .ip4_u =
@@ -1415,19 +1429,16 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 
 	if (imr->imr_ifindex) {
 		idev = inetdev_by_index(net, imr->imr_ifindex);
-		if (idev)
-			__in_dev_put(idev);
 		return idev;
 	}
 	if (imr->imr_address.s_addr) {
-		dev = ip_dev_find(net, imr->imr_address.s_addr);
+		dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
 		if (!dev)
 			return NULL;
-		dev_put(dev);
 	}
 
 	if (!dev && !ip_route_output_key(net, &rt, &fl)) {
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 	}
 	if (dev) {
@@ -1646,8 +1657,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
 				if (dpsf->sf_inaddr == psf->sf_inaddr)
 					break;
 			if (!dpsf) {
-				dpsf = (struct ip_sf_list *)
-					kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
 				if (!dpsf)
 					continue;
 				*dpsf = *psf;
@@ -1807,6 +1817,7 @@ done:
 	rtnl_unlock();
 	return err;
 }
+EXPORT_SYMBOL(ip_mc_join_group);
 
 static void ip_sf_socklist_reclaim(struct rcu_head *rp)
 {
@@ -2679,8 +2690,3 @@ int __init igmp_mc_proc_init(void)
 	return register_pernet_subsys(&igmp_net_ops);
 }
 #endif
-
-EXPORT_SYMBOL(ip_mc_dec_group);
-EXPORT_SYMBOL(ip_mc_inc_group);
-EXPORT_SYMBOL(ip_mc_join_group);
-EXPORT_SYMBOL(ip_mc_rejoin_group);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 70eb3507c40..7174370b119 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -84,7 +84,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
 	}
 	return node != NULL;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
 
 /* Obtain a reference to a local port for the given sock,
@@ -212,7 +211,6 @@ fail:
 	local_bh_enable();
 	return ret;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
 
 /*
@@ -305,7 +303,6 @@ out_err:
 	*err = error;
 	goto out;
 }
-
 EXPORT_SYMBOL(inet_csk_accept);
 
 /*
@@ -327,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
 	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
 	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
 }
-
 EXPORT_SYMBOL(inet_csk_init_xmit_timers);
 
 void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -340,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
 	sk_stop_timer(sk, &sk->sk_timer);
 }
-
 EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
 
 void inet_csk_delete_keepalive_timer(struct sock *sk)
 {
 	sk_stop_timer(sk, &sk->sk_timer);
 }
-
 EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
 
 void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
 {
 	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
 }
-
 EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
 
 struct dst_entry *inet_csk_route_req(struct sock *sk,
@@ -383,7 +376,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 		goto no_route;
 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto route_err;
-	return &rt->u.dst;
+	return &rt->dst;
 
 route_err:
 	ip_rt_put(rt);
@@ -391,7 +384,6 @@ no_route:
 	IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 	return NULL;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_route_req);
 
 static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
@@ -433,7 +425,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
 
 	return req;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_search_req);
 
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -447,11 +438,11 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 	reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
 	inet_csk_reqsk_queue_added(sk, timeout);
 }
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 
 /* Only thing we need from tcp.h */
 extern int sysctl_tcp_synack_retries;
 
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 
 /* Decide when to expire the request and when to resend SYN-ACK */
 static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
@@ -569,7 +560,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 	if (lopt->qlen)
 		inet_csk_reset_keepalive_timer(parent, interval);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 
 struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
@@ -599,7 +589,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 	}
 	return newsk;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_clone);
 
 /*
@@ -630,7 +619,6 @@ void inet_csk_destroy_sock(struct sock *sk)
 	percpu_counter_dec(sk->sk_prot->orphan_count);
 	sock_put(sk);
 }
-
 EXPORT_SYMBOL(inet_csk_destroy_sock);
 
 int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
@@ -665,7 +653,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
 	__reqsk_queue_destroy(&icsk->icsk_accept_queue);
 	return -EADDRINUSE;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 
 /*
@@ -720,7 +707,6 @@ void inet_csk_listen_stop(struct sock *sk)
 	}
 	WARN_ON(sk->sk_ack_backlog);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
 
 void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -732,7 +718,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
 	sin->sin_addr.s_addr	= inet->inet_daddr;
 	sin->sin_port		= inet->inet_dport;
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
 
 #ifdef CONFIG_COMPAT
@@ -747,7 +732,6 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
 					     optval, optlen);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
 
 int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
@@ -761,6 +745,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
 					     optval, optlen);
 }
-
 EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 #endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce32..ba804266584 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
 			bc += op->no;
 		}
 	}
-	return (len == 0);
+	return len == 0;
 }
 
 static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a2ca6aed763..5ff2a51b6d0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -114,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 		fq->last_in |= INET_FRAG_COMPLETE;
 	}
 }
-
 EXPORT_SYMBOL(inet_frag_kill);
 
 static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index d3e160a8821..1b344f30b46 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -99,24 +99,46 @@ void inet_put_port(struct sock *sk)
 	__inet_put_port(sk);
 	local_bh_enable();
 }
-
 EXPORT_SYMBOL(inet_put_port);
 
-void __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(struct sock *sk, struct sock *child)
 {
 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
-	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
+	unsigned short port = inet_sk(child)->inet_num;
+	const int bhash = inet_bhashfn(sock_net(sk), port,
 			table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
 	struct inet_bind_bucket *tb;
 
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
+	if (tb->port != port) {
+		/* NOTE: using tproxy and redirecting skbs to a proxy
+		 * on a different listener port breaks the assumption
+		 * that the listener socket's icsk_bind_hash is the same
+		 * as that of the child socket. We have to look up or
+		 * create a new bind bucket for the child here. */
+		struct hlist_node *node;
+		inet_bind_bucket_for_each(tb, node, &head->chain) {
+			if (net_eq(ib_net(tb), sock_net(sk)) &&
+			    tb->port == port)
+				break;
+		}
+		if (!node) {
+			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+						     sock_net(sk), head, port);
+			if (!tb) {
+				spin_unlock(&head->lock);
+				return -ENOMEM;
+			}
+		}
+	}
 	sk_add_bind_node(child, &tb->owners);
 	inet_csk(child)->icsk_bind_hash = tb;
 	spin_unlock(&head->lock);
-}
 
+	return 0;
+}
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
 static inline int compute_score(struct sock *sk, struct net *net,
@@ -546,7 +568,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
 			__inet_check_established, __inet_hash_nolisten);
 }
-
 EXPORT_SYMBOL_GPL(inet_hash_connect);
 
 void inet_hashinfo_init(struct inet_hashinfo *h)
@@ -560,5 +581,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 				      i + LISTENING_NULLS_BASE);
 		}
 }
-
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 6bcfe52a9c8..9e94d7cf4f8 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@
  *  lookups performed with disabled BHs.
  *
  *  Serialisation issues.
- *  1.  Nodes may appear in the tree only with the pool write lock held.
- *  2.  Nodes may disappear from the tree only with the pool write lock held
+ *  1.  Nodes may appear in the tree only with the pool lock held.
+ *  2.  Nodes may disappear from the tree only with the pool lock held
  *      AND reference count being 0.
  *  3.  Nodes appears and disappears from unused node list only under
  *      "inet_peer_unused_lock".
@@ -64,23 +64,32 @@
  *		   usually under some other lock to prevent node disappearing
  *		dtime: unused node list lock
  *		v4daddr: unchangeable
- *		ip_id_count: idlock
+ *		ip_id_count: atomic value (no lock needed)
  */
 
 static struct kmem_cache *peer_cachep __read_mostly;
 
 #define node_height(x) x->avl_height
-static struct inet_peer peer_fake_node = {
-	.avl_left	= &peer_fake_node,
-	.avl_right	= &peer_fake_node,
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+	.avl_left	= peer_avl_empty_rcu,
+	.avl_right	= peer_avl_empty_rcu,
 	.avl_height	= 0
 };
-#define peer_avl_empty (&peer_fake_node)
-static struct inet_peer *peer_root = peer_avl_empty;
-static DEFINE_RWLOCK(peer_pool_lock);
+
+static struct {
+	struct inet_peer __rcu *root;
+	spinlock_t	lock;
+	int		total;
+} peers = {
+	.root		= peer_avl_empty_rcu,
+	.lock		= __SPIN_LOCK_UNLOCKED(peers.lock),
+	.total		= 0,
+};
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
-static int peer_total;
 /* Exported for sysctl_net_ipv4.  */
 int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
 					 * aggressively at this stage */
@@ -89,8 +98,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min
 int inet_peer_gc_mintime __read_mostly = 10 * HZ;
 int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
 
-static LIST_HEAD(unused_peers);
-static DEFINE_SPINLOCK(inet_peer_unused_lock);
+static struct {
+	struct list_head	list;
+	spinlock_t		lock;
+} unused_peers = {
+	.list			= LIST_HEAD_INIT(unused_peers.list),
+	.lock			= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
+};
 
 static void peer_check_expire(unsigned long dummy);
 static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
@@ -116,7 +130,7 @@ void __init inet_initpeers(void)
 
 	peer_cachep = kmem_cache_create("inet_peer_cache",
 			sizeof(struct inet_peer),
-			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
 	/* All the timers, started at system startup tend
@@ -131,122 +145,174 @@ void __init inet_initpeers(void)
 /* Called with or without local BH being disabled. */
 static void unlink_from_unused(struct inet_peer *p)
 {
-	spin_lock_bh(&inet_peer_unused_lock);
-	list_del_init(&p->unused);
-	spin_unlock_bh(&inet_peer_unused_lock);
+	if (!list_empty(&p->unused)) {
+		spin_lock_bh(&unused_peers.lock);
+		list_del_init(&p->unused);
+		spin_unlock_bh(&unused_peers.lock);
+	}
 }
 
 /*
  * Called with local BH disabled and the pool lock held.
- * _stack is known to be NULL or not at compile time,
- * so compiler will optimize the if (_stack) tests.
  */
 #define lookup(_daddr, _stack) 					\
 ({								\
-	struct inet_peer *u, **v;				\
-	if (_stack != NULL) {					\
-		stackptr = _stack;				\
-		*stackptr++ = &peer_root;			\
-	}							\
-	for (u = peer_root; u != peer_avl_empty; ) {		\
+	struct inet_peer *u;					\
+	struct inet_peer __rcu **v;				\
+								\
+	stackptr = _stack;					\
+	*stackptr++ = &peers.root;				\
+	for (u = rcu_dereference_protected(peers.root,		\
+			lockdep_is_held(&peers.lock));		\
+	     u != peer_avl_empty; ) {				\
 		if (_daddr == u->v4daddr)			\
 			break;					\
 		if ((__force __u32)_daddr < (__force __u32)u->v4daddr)	\
 			v = &u->avl_left;			\
 		else						\
 			v = &u->avl_right;			\
-		if (_stack != NULL)				\
-			*stackptr++ = v;			\
-		u = *v;						\
+		*stackptr++ = v;				\
+		u = rcu_dereference_protected(*v,		\
+			lockdep_is_held(&peers.lock));		\
 	}							\
 	u;							\
 })
 
-/* Called with local BH disabled and the pool write lock held. */
+/*
+ * Called with rcu_read_lock_bh()
+ * Because we hold no lock against a writer, its quite possible we fall
+ * in an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if number of links exceeds PEER_MAXDEPTH
+ */
+static struct inet_peer *lookup_rcu_bh(__be32 daddr)
+{
+	struct inet_peer *u = rcu_dereference_bh(peers.root);
+	int count = 0;
+
+	while (u != peer_avl_empty) {
+		if (daddr == u->v4daddr) {
+			/* Before taking a reference, check if this entry was
+			 * deleted, unlink_from_pool() sets refcnt=-1 to make
+			 * distinction between an unused entry (refcnt=0) and
+			 * a freed one.
+			 */
+			if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+				u = NULL;
+			return u;
+		}
+		if ((__force __u32)daddr < (__force __u32)u->v4daddr)
+			u = rcu_dereference_bh(u->avl_left);
+		else
+			u = rcu_dereference_bh(u->avl_right);
+		if (unlikely(++count == PEER_MAXDEPTH))
+			break;
+	}
+	return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
 #define lookup_rightempty(start)				\
 ({								\
-	struct inet_peer *u, **v;				\
+	struct inet_peer *u;					\
+	struct inet_peer __rcu **v;				\
 	*stackptr++ = &start->avl_left;				\
 	v = &start->avl_left;					\
-	for (u = *v; u->avl_right != peer_avl_empty; ) {	\
+	for (u = rcu_dereference_protected(*v,			\
+			lockdep_is_held(&peers.lock));		\
+	     u->avl_right != peer_avl_empty_rcu; ) {		\
 		v = &u->avl_right;				\
 		*stackptr++ = v;				\
-		u = *v;						\
+		u = rcu_dereference_protected(*v,		\
+			lockdep_is_held(&peers.lock));		\
 	}							\
 	u;							\
 })
 
-/* Called with local BH disabled and the pool write lock held.
+/* Called with local BH disabled and the pool lock held.
  * Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas.  */
-static void peer_avl_rebalance(struct inet_peer **stack[],
-		struct inet_peer ***stackend)
+ * Look into mm/map_avl.c for more detail description of the ideas.
+ */
+static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
+		struct inet_peer __rcu ***stackend)
 {
-	struct inet_peer **nodep, *node, *l, *r;
+	struct inet_peer __rcu **nodep;
+	struct inet_peer *node, *l, *r;
 	int lh, rh;
 
 	while (stackend > stack) {
 		nodep = *--stackend;
-		node = *nodep;
-		l = node->avl_left;
-		r = node->avl_right;
+		node = rcu_dereference_protected(*nodep,
+				lockdep_is_held(&peers.lock));
+		l = rcu_dereference_protected(node->avl_left,
+				lockdep_is_held(&peers.lock));
+		r = rcu_dereference_protected(node->avl_right,
+				lockdep_is_held(&peers.lock));
 		lh = node_height(l);
 		rh = node_height(r);
 		if (lh > rh + 1) { /* l: RH+2 */
 			struct inet_peer *ll, *lr, *lrl, *lrr;
 			int lrh;
-			ll = l->avl_left;
-			lr = l->avl_right;
+			ll = rcu_dereference_protected(l->avl_left,
+				lockdep_is_held(&peers.lock));
+			lr = rcu_dereference_protected(l->avl_right,
+				lockdep_is_held(&peers.lock));
 			lrh = node_height(lr);
 			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
-				node->avl_left = lr;	/* lr: RH or RH+1 */
-				node->avl_right = r;	/* r: RH */
+				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
+				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
 				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
-				l->avl_left = ll;	/* ll: RH+1 */
-				l->avl_right = node;	/* node: RH+1 or RH+2 */
+				RCU_INIT_POINTER(l->avl_left, ll);       /* ll: RH+1 */
+				RCU_INIT_POINTER(l->avl_right, node);	/* node: RH+1 or RH+2 */
 				l->avl_height = node->avl_height + 1;
-				*nodep = l;
+				RCU_INIT_POINTER(*nodep, l);
 			} else { /* ll: RH, lr: RH+1 */
-				lrl = lr->avl_left;	/* lrl: RH or RH-1 */
-				lrr = lr->avl_right;	/* lrr: RH or RH-1 */
-				node->avl_left = lrr;	/* lrr: RH or RH-1 */
-				node->avl_right = r;	/* r: RH */
+				lrl = rcu_dereference_protected(lr->avl_left,
+					lockdep_is_held(&peers.lock));	/* lrl: RH or RH-1 */
+				lrr = rcu_dereference_protected(lr->avl_right,
+					lockdep_is_held(&peers.lock));	/* lrr: RH or RH-1 */
+				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
+				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
 				node->avl_height = rh + 1; /* node: RH+1 */
-				l->avl_left = ll;	/* ll: RH */
-				l->avl_right = lrl;	/* lrl: RH or RH-1 */
+				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH */
+				RCU_INIT_POINTER(l->avl_right, lrl);	/* lrl: RH or RH-1 */
 				l->avl_height = rh + 1;	/* l: RH+1 */
-				lr->avl_left = l;	/* l: RH+1 */
-				lr->avl_right = node;	/* node: RH+1 */
+				RCU_INIT_POINTER(lr->avl_left, l);	/* l: RH+1 */
+				RCU_INIT_POINTER(lr->avl_right, node);	/* node: RH+1 */
 				lr->avl_height = rh + 2;
-				*nodep = lr;
+				RCU_INIT_POINTER(*nodep, lr);
 			}
 		} else if (rh > lh + 1) { /* r: LH+2 */
 			struct inet_peer *rr, *rl, *rlr, *rll;
 			int rlh;
-			rr = r->avl_right;
-			rl = r->avl_left;
+			rr = rcu_dereference_protected(r->avl_right,
+				lockdep_is_held(&peers.lock));
+			rl = rcu_dereference_protected(r->avl_left,
+				lockdep_is_held(&peers.lock));
 			rlh = node_height(rl);
 			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
-				node->avl_right = rl;	/* rl: LH or LH+1 */
-				node->avl_left = l;	/* l: LH */
+				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
+				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
 				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
-				r->avl_right = rr;	/* rr: LH+1 */
-				r->avl_left = node;	/* node: LH+1 or LH+2 */
+				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH+1 */
+				RCU_INIT_POINTER(r->avl_left, node);	/* node: LH+1 or LH+2 */
 				r->avl_height = node->avl_height + 1;
-				*nodep = r;
+				RCU_INIT_POINTER(*nodep, r);
 			} else { /* rr: RH, rl: RH+1 */
-				rlr = rl->avl_right;	/* rlr: LH or LH-1 */
-				rll = rl->avl_left;	/* rll: LH or LH-1 */
-				node->avl_right = rll;	/* rll: LH or LH-1 */
-				node->avl_left = l;	/* l: LH */
+				rlr = rcu_dereference_protected(rl->avl_right,
+					lockdep_is_held(&peers.lock));	/* rlr: LH or LH-1 */
+				rll = rcu_dereference_protected(rl->avl_left,
+					lockdep_is_held(&peers.lock));	/* rll: LH or LH-1 */
+				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
+				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
 				node->avl_height = lh + 1; /* node: LH+1 */
-				r->avl_right = rr;	/* rr: LH */
-				r->avl_left = rlr;	/* rlr: LH or LH-1 */
+				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH */
+				RCU_INIT_POINTER(r->avl_left, rlr);	/* rlr: LH or LH-1 */
 				r->avl_height = lh + 1;	/* r: LH+1 */
-				rl->avl_right = r;	/* r: LH+1 */
-				rl->avl_left = node;	/* node: LH+1 */
+				RCU_INIT_POINTER(rl->avl_right, r);	/* r: LH+1 */
+				RCU_INIT_POINTER(rl->avl_left, node);	/* node: LH+1 */
 				rl->avl_height = lh + 2;
-				*nodep = rl;
+				RCU_INIT_POINTER(*nodep, rl);
 			}
 		} else {
 			node->avl_height = (lh > rh ? lh : rh) + 1;
@@ -254,15 +320,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
 	}
 }
 
-/* Called with local BH disabled and the pool write lock held. */
+/* Called with local BH disabled and the pool lock held. */
 #define link_to_pool(n)						\
 do {								\
 	n->avl_height = 1;					\
-	n->avl_left = peer_avl_empty;				\
-	n->avl_right = peer_avl_empty;				\
-	**--stackptr = n;					\
+	n->avl_left = peer_avl_empty_rcu;			\
+	n->avl_right = peer_avl_empty_rcu;			\
+	/* lockless readers can catch us now */			\
+	rcu_assign_pointer(**--stackptr, n);			\
 	peer_avl_rebalance(stack, stackptr);			\
-} while(0)
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head)
+{
+	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
 
 /* May be called with local BH enabled. */
 static void unlink_from_pool(struct inet_peer *p)
@@ -271,31 +343,33 @@ static void unlink_from_pool(struct inet_peer *p)
 
 	do_free = 0;
 
-	write_lock_bh(&peer_pool_lock);
+	spin_lock_bh(&peers.lock);
 	/* Check the reference counter.  It was artificially incremented by 1
-	 * in cleanup() function to prevent sudden disappearing.  If the
-	 * reference count is still 1 then the node is referenced only as `p'
-	 * here and from the pool.  So under the exclusive pool lock it's safe
-	 * to remove the node and free it later. */
-	if (atomic_read(&p->refcnt) == 1) {
-		struct inet_peer **stack[PEER_MAXDEPTH];
-		struct inet_peer ***stackptr, ***delp;
+	 * in cleanup() function to prevent sudden disappearing.  If we can
+	 * atomically (because of lockless readers) take this last reference,
+	 * it's safe to remove the node and free it later.
+	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
+	 */
+	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
+		struct inet_peer __rcu **stack[PEER_MAXDEPTH];
+		struct inet_peer __rcu ***stackptr, ***delp;
 		if (lookup(p->v4daddr, stack) != p)
 			BUG();
 		delp = stackptr - 1; /* *delp[0] == p */
-		if (p->avl_left == peer_avl_empty) {
+		if (p->avl_left == peer_avl_empty_rcu) {
 			*delp[0] = p->avl_right;
 			--stackptr;
 		} else {
 			/* look for a node to insert instead of p */
 			struct inet_peer *t;
 			t = lookup_rightempty(p);
-			BUG_ON(*stackptr[-1] != t);
+			BUG_ON(rcu_dereference_protected(*stackptr[-1],
+					lockdep_is_held(&peers.lock)) != t);
 			**--stackptr = t->avl_left;
 			/* t is removed, t->v4daddr > x->v4daddr for any
 			 * x in p->avl_left subtree.
 			 * Put t in the old place of p. */
-			*delp[0] = t;
+			RCU_INIT_POINTER(*delp[0], t);
 			t->avl_left = p->avl_left;
 			t->avl_right = p->avl_right;
 			t->avl_height = p->avl_height;
@@ -303,20 +377,21 @@ static void unlink_from_pool(struct inet_peer *p)
 			delp[1] = &t->avl_left; /* was &p->avl_left */
 		}
 		peer_avl_rebalance(stack, stackptr);
-		peer_total--;
+		peers.total--;
 		do_free = 1;
 	}
-	write_unlock_bh(&peer_pool_lock);
+	spin_unlock_bh(&peers.lock);
 
 	if (do_free)
-		kmem_cache_free(peer_cachep, p);
+		call_rcu_bh(&p->rcu, inetpeer_free_rcu);
 	else
 		/* The node is used again.  Decrease the reference counter
 		 * back.  The loop "cleanup -> unlink_from_unused
 		 *   -> unlink_from_pool -> putpeer -> link_to_unused
 		 *   -> cleanup (for the same node)"
 		 * doesn't really exist because the entry will have a
-		 * recent deletion time and will not be cleaned again soon. */
+		 * recent deletion time and will not be cleaned again soon.
+		 */
 		inet_putpeer(p);
 }
 
@@ -326,16 +401,16 @@ static int cleanup_once(unsigned long ttl)
 	struct inet_peer *p = NULL;
 
 	/* Remove the first entry from the list of unused nodes. */
-	spin_lock_bh(&inet_peer_unused_lock);
-	if (!list_empty(&unused_peers)) {
+	spin_lock_bh(&unused_peers.lock);
+	if (!list_empty(&unused_peers.list)) {
 		__u32 delta;
 
-		p = list_first_entry(&unused_peers, struct inet_peer, unused);
+		p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
 		delta = (__u32)jiffies - p->dtime;
 
 		if (delta < ttl) {
 			/* Do not prune fresh entries. */
-			spin_unlock_bh(&inet_peer_unused_lock);
+			spin_unlock_bh(&unused_peers.lock);
 			return -1;
 		}
 
@@ -345,7 +420,7 @@ static int cleanup_once(unsigned long ttl)
 		 * before unlink_from_pool() call. */
 		atomic_inc(&p->refcnt);
 	}
-	spin_unlock_bh(&inet_peer_unused_lock);
+	spin_unlock_bh(&unused_peers.lock);
 
 	if (p == NULL)
 		/* It means that the total number of USED entries has
@@ -360,62 +435,56 @@ static int cleanup_once(unsigned long ttl)
 /* Called with or without local BH being disabled. */
 struct inet_peer *inet_getpeer(__be32 daddr, int create)
 {
-	struct inet_peer *p, *n;
-	struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+	struct inet_peer *p;
+	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 
-	/* Look up for the address quickly. */
-	read_lock_bh(&peer_pool_lock);
-	p = lookup(daddr, NULL);
-	if (p != peer_avl_empty)
-		atomic_inc(&p->refcnt);
-	read_unlock_bh(&peer_pool_lock);
+	/* Look up for the address quickly, lockless.
+	 * Because of a concurrent writer, we might not find an existing entry.
+	 */
+	rcu_read_lock_bh();
+	p = lookup_rcu_bh(daddr);
+	rcu_read_unlock_bh();
+
+	if (p) {
+		/* The existing node has been found.
+		 * Remove the entry from unused list if it was there.
+		 */
+		unlink_from_unused(p);
+		return p;
+	}
 
+	/* retry an exact lookup, taking the lock before.
+	 * At least, nodes should be hot in our cache.
+	 */
+	spin_lock_bh(&peers.lock);
+	p = lookup(daddr, stack);
 	if (p != peer_avl_empty) {
-		/* The existing node has been found. */
+		atomic_inc(&p->refcnt);
+		spin_unlock_bh(&peers.lock);
 		/* Remove the entry from unused list if it was there. */
 		unlink_from_unused(p);
 		return p;
 	}
+	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+	if (p) {
+		p->v4daddr = daddr;
+		atomic_set(&p->refcnt, 1);
+		atomic_set(&p->rid, 0);
+		atomic_set(&p->ip_id_count, secure_ip_id(daddr));
+		p->tcp_ts_stamp = 0;
+		INIT_LIST_HEAD(&p->unused);
+
+
+		/* Link the node. */
+		link_to_pool(p);
+		peers.total++;
+	}
+	spin_unlock_bh(&peers.lock);
 
-	if (!create)
-		return NULL;
-
-	/* Allocate the space outside the locked region. */
-	n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
-	if (n == NULL)
-		return NULL;
-	n->v4daddr = daddr;
-	atomic_set(&n->refcnt, 1);
-	atomic_set(&n->rid, 0);
-	atomic_set(&n->ip_id_count, secure_ip_id(daddr));
-	n->tcp_ts_stamp = 0;
-
-	write_lock_bh(&peer_pool_lock);
-	/* Check if an entry has suddenly appeared. */
-	p = lookup(daddr, stack);
-	if (p != peer_avl_empty)
-		goto out_free;
-
-	/* Link the node. */
-	link_to_pool(n);
-	INIT_LIST_HEAD(&n->unused);
-	peer_total++;
-	write_unlock_bh(&peer_pool_lock);
-
-	if (peer_total >= inet_peer_threshold)
+	if (peers.total >= inet_peer_threshold)
 		/* Remove one less-recently-used entry. */
 		cleanup_once(0);
 
-	return n;
-
-out_free:
-	/* The appropriate node is already in the pool. */
-	atomic_inc(&p->refcnt);
-	write_unlock_bh(&peer_pool_lock);
-	/* Remove the entry from unused list if it was there. */
-	unlink_from_unused(p);
-	/* Free preallocated the preallocated node. */
-	kmem_cache_free(peer_cachep, n);
 	return p;
 }
 
@@ -425,12 +494,12 @@ static void peer_check_expire(unsigned long dummy)
 	unsigned long now = jiffies;
 	int ttl;
 
-	if (peer_total >= inet_peer_threshold)
+	if (peers.total >= inet_peer_threshold)
 		ttl = inet_peer_minttl;
 	else
 		ttl = inet_peer_maxttl
 				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
-					peer_total / inet_peer_threshold * HZ;
+					peers.total / inet_peer_threshold * HZ;
 	while (!cleanup_once(ttl)) {
 		if (jiffies != now)
 			break;
@@ -439,22 +508,25 @@ static void peer_check_expire(unsigned long dummy)
 	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
 	 * interval depending on the total number of entries (more entries,
 	 * less interval). */
-	if (peer_total >= inet_peer_threshold)
+	if (peers.total >= inet_peer_threshold)
 		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
 	else
 		peer_periodic_timer.expires = jiffies
 			+ inet_peer_gc_maxtime
 			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
-				peer_total / inet_peer_threshold * HZ;
+				peers.total / inet_peer_threshold * HZ;
 	add_timer(&peer_periodic_timer);
 }
 
 void inet_putpeer(struct inet_peer *p)
 {
-	spin_lock_bh(&inet_peer_unused_lock);
-	if (atomic_dec_and_test(&p->refcnt)) {
-		list_add_tail(&p->unused, &unused_peers);
+	local_bh_disable();
+
+	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
+		list_add_tail(&p->unused, &unused_peers.list);
 		p->dtime = (__u32)jiffies;
+		spin_unlock(&unused_peers.lock);
 	}
-	spin_unlock_bh(&inet_peer_unused_lock);
+
+	local_bh_enable();
 }
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 56cdf68a074..99461f09320 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -87,16 +87,16 @@ int ip_forward(struct sk_buff *skb)
 	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
 		goto sr_failed;
 
-	if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
+	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
 		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
-		IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS);
+		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(dst_mtu(&rt->u.dst)));
+			  htonl(dst_mtu(&rt->dst)));
 		goto drop;
 	}
 
 	/* We are about to mangle packet. Copy it! */
-	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
 		goto drop;
 	iph = ip_hdr(skb);
 
@@ -113,7 +113,7 @@ int ip_forward(struct sk_buff *skb)
 	skb->priority = rt_tos2priority(iph->tos);
 
 	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
-		       rt->u.dst.dev, ip_forward_finish);
+		       rt->dst.dev, ip_forward_finish);
 
 sr_failed:
 	/*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 75347ea70ea..168440834ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,19 +116,16 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
 	struct ip4_create_arg *arg = a;
 
 	qp = container_of(q, struct ipq, q);
-	return (qp->id == arg->iph->id &&
+	return	qp->id == arg->iph->id &&
 			qp->saddr == arg->iph->saddr &&
 			qp->daddr == arg->iph->daddr &&
 			qp->protocol == arg->iph->protocol &&
-			qp->user == arg->user);
+			qp->user == arg->user;
 }
 
 /* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct netns_frags *nf,
-		struct sk_buff *skb, int *work)
+static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
 {
-	if (work)
-		*work -= skb->truesize;
 	atomic_sub(skb->truesize, &nf->mem);
 	kfree_skb(skb);
 }
@@ -309,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp)
 	fp = qp->q.fragments;
 	do {
 		struct sk_buff *xp = fp->next;
-		frag_kfree_skb(qp->q.net, fp, NULL);
+		frag_kfree_skb(qp->q.net, fp);
 		fp = xp;
 	} while (fp);
 
@@ -317,6 +314,7 @@ static int ip_frag_reinit(struct ipq *qp)
 	qp->q.len = 0;
 	qp->q.meat = 0;
 	qp->q.fragments = NULL;
+	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
 
 	return 0;
@@ -389,6 +387,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	 * in the chain of fragments so far.  We must know where to put
 	 * this fragment, right?
 	 */
+	prev = qp->q.fragments_tail;
+	if (!prev || FRAG_CB(prev)->offset < offset) {
+		next = NULL;
+		goto found;
+	}
 	prev = NULL;
 	for (next = qp->q.fragments; next != NULL; next = next->next) {
 		if (FRAG_CB(next)->offset >= offset)
@@ -396,6 +399,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		prev = next;
 	}
 
+found:
 	/* We found where to put this one.  Check for overlap with
 	 * preceding fragment, and, if needed, align things so that
 	 * any overlaps are eliminated.
@@ -446,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 				qp->q.fragments = next;
 
 			qp->q.meat -= free_it->len;
-			frag_kfree_skb(qp->q.net, free_it, NULL);
+			frag_kfree_skb(qp->q.net, free_it);
 		}
 	}
 
@@ -454,6 +458,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
+	if (!next)
+		qp->q.fragments_tail = skb;
 	if (prev)
 		prev->next = skb;
 	else
@@ -507,6 +513,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			goto out_nomem;
 
 		fp->next = head->next;
+		if (!fp->next)
+			qp->q.fragments_tail = fp;
 		prev->next = fp;
 
 		skb_morph(head, qp->q.fragments);
@@ -534,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_has_frags(head)) {
+	if (skb_has_frag_list(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
@@ -556,7 +564,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
 	skb_shinfo(head)->frag_list = head->next;
 	skb_push(head, head->data - skb_network_header(head));
-	atomic_sub(head->truesize, &qp->q.net->mem);
 
 	for (fp=head->next; fp; fp = fp->next) {
 		head->data_len += fp->len;
@@ -566,8 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
-		atomic_sub(fp->truesize, &qp->q.net->mem);
 	}
+	atomic_sub(head->truesize, &qp->q.net->mem);
 
 	head->next = NULL;
 	head->dev = dev;
@@ -578,6 +585,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	iph->tot_len = htons(len);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
+	qp->q.fragments_tail = NULL;
 	return 0;
 
 out_nomem:
@@ -624,6 +632,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	kfree_skb(skb);
 	return -ENOMEM;
 }
+EXPORT_SYMBOL(ip_defrag);
 
 #ifdef CONFIG_SYSCTL
 static int zero;
@@ -777,5 +786,3 @@ void __init ipfrag_init(void)
 	ip4_frags.secret_interval = 10 * 60 * HZ;
 	inet_frags_init(&ip4_frags);
 }
-
-EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 32618e11076..70ff77f02ee 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,8 +44,9 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
+#include <net/gre.h>
 
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
@@ -63,13 +64,13 @@
    We cannot track such dead loops during route installation,
    it is infeasible task. The most general solutions would be
    to keep skb->encapsulation counter (sort of local ttl),
-   and silently drop packet when it expires. It is the best
+   and silently drop packet when it expires. It is a good
    solution, but it supposes maintaing new variable in ALL
    skb, even if no tunneling is used.
 
-   Current solution: HARD_TX_LOCK lock breaks dead loops.
-
-
+   Current solution: xmit_recursion breaks dead loops. This is a percpu
+   counter, since when we enter the first ndo_xmit(), cpu migration is
+   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
 
    2. Networking dead loops would not kill routers, but would really
    kill network. IP hop limit plays role of "t->recursion" in this case,
@@ -128,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 static int ipgre_net_id __read_mostly;
 struct ipgre_net {
-	struct ip_tunnel *tunnels[4][HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
 
 	struct net_device *fb_tunnel_dev;
 };
@@ -158,13 +159,40 @@ struct ipgre_net {
 #define tunnels_l	tunnels[1]
 #define tunnels_wc	tunnels[0]
 /*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
  */
-static DEFINE_SPINLOCK(ipgre_lock);
 
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{
+	struct pcpu_tstats sum = { 0 };
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+		sum.rx_packets += tstats->rx_packets;
+		sum.rx_bytes   += tstats->rx_bytes;
+		sum.tx_packets += tstats->tx_packets;
+		sum.tx_bytes   += tstats->tx_bytes;
+	}
+	dev->stats.rx_packets = sum.rx_packets;
+	dev->stats.rx_bytes   = sum.rx_bytes;
+	dev->stats.tx_packets = sum.tx_packets;
+	dev->stats.tx_bytes   = sum.tx_bytes;
+	return &dev->stats;
+}
+
 /* Given src, dst and key, find appropriate for input tunnel. */
 
 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -173,8 +201,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 {
 	struct net *net = dev_net(dev);
 	int link = dev->ifindex;
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(key);
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(key);
 	struct ip_tunnel *t, *cand = NULL;
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -289,13 +317,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 	return NULL;
 }
 
-static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
+static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 		struct ip_tunnel_parm *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
 	__be32 key = parms->i_key;
-	unsigned h = HASH(key);
+	unsigned int h = HASH(key);
 	int prio = 0;
 
 	if (local)
@@ -308,7 +336,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
 	return &ign->tunnels[prio][h];
 }
 
-static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
+static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
 		struct ip_tunnel *t)
 {
 	return __ipgre_bucket(ign, &t->parms);
@@ -316,23 +344,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
 
 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp = ipgre_bucket(ign, t);
+	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
 
-	spin_lock_bh(&ipgre_lock);
-	t->next = *tp;
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
-	spin_unlock_bh(&ipgre_lock);
 }
 
 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp;
-
-	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
-		if (t == *tp) {
-			spin_lock_bh(&ipgre_lock);
-			*tp = t->next;
-			spin_unlock_bh(&ipgre_lock);
+	struct ip_tunnel __rcu **tp;
+	struct ip_tunnel *iter;
+
+	for (tp = ipgre_bucket(ign, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
 			break;
 		}
 	}
@@ -346,10 +373,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 	__be32 local = parms->iph.saddr;
 	__be32 key = parms->i_key;
 	int link = parms->link;
-	struct ip_tunnel *t, **tp;
+	struct ip_tunnel *t;
+	struct ip_tunnel __rcu **tp;
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 
-	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+	for (tp = __ipgre_bucket(ign, parms);
+	     (t = rtnl_dereference(*tp)) != NULL;
+	     tp = &t->next)
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
 		    key == t->parms.i_key &&
@@ -360,7 +390,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 	return t;
 }
 
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
 		struct ip_tunnel_parm *parms, int create)
 {
 	struct ip_tunnel *t, *nt;
@@ -582,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 					  iph->saddr, iph->daddr, key,
 					  gre_proto))) {
-		struct net_device_stats *stats = &tunnel->dev->stats;
+		struct pcpu_tstats *tstats;
 
 		secpath_reset(skb);
 
@@ -606,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
 			/* Looped back packet, drop it! */
 			if (skb_rtable(skb)->fl.iif == 0)
 				goto drop;
-			stats->multicast++;
+			tunnel->dev->stats.multicast++;
 			skb->pkt_type = PACKET_BROADCAST;
 		}
 #endif
 
 		if (((flags&GRE_CSUM) && csum) ||
 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
-			stats->rx_crc_errors++;
-			stats->rx_errors++;
+			tunnel->dev->stats.rx_crc_errors++;
+			tunnel->dev->stats.rx_errors++;
 			goto drop;
 		}
 		if (tunnel->parms.i_flags&GRE_SEQ) {
 			if (!(flags&GRE_SEQ) ||
 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
-				stats->rx_fifo_errors++;
-				stats->rx_errors++;
+				tunnel->dev->stats.rx_fifo_errors++;
+				tunnel->dev->stats.rx_errors++;
 				goto drop;
 			}
 			tunnel->i_seqno = seqno + 1;
@@ -630,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
 		/* Warning: All skb pointers will be invalidated! */
 		if (tunnel->dev->type == ARPHRD_ETHER) {
 			if (!pskb_may_pull(skb, ETH_HLEN)) {
-				stats->rx_length_errors++;
-				stats->rx_errors++;
+				tunnel->dev->stats.rx_length_errors++;
+				tunnel->dev->stats.rx_errors++;
 				goto drop;
 			}
 
@@ -640,14 +670,19 @@ static int ipgre_rcv(struct sk_buff *skb)
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
-		skb_tunnel_rx(skb, tunnel->dev);
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
 
 		skb_reset_network_header(skb);
 		ipgre_ecn_decapsulate(iph, skb);
 
 		netif_rx(skb);
+
 		rcu_read_unlock();
-		return(0);
+		return 0;
 	}
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
@@ -655,20 +690,19 @@ drop:
 	rcu_read_unlock();
 drop_nolock:
 	kfree_skb(skb);
-	return(0);
+	return 0;
 }
 
 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+	struct pcpu_tstats *tstats;
 	struct iphdr  *old_iph = ip_hdr(skb);
 	struct iphdr  *tiph;
 	u8     tos;
 	__be16 df;
 	struct rtable *rt;     			/* Route to the other host */
-	struct net_device *tdev;			/* Device to other host */
+	struct net_device *tdev;		/* Device to other host */
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    gre_hlen;
@@ -690,7 +724,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		/* NBMA tunnel */
 
 		if (skb_dst(skb) == NULL) {
-			stats->tx_fifo_errors++;
+			dev->stats.tx_fifo_errors++;
 			goto tx_error;
 		}
 
@@ -699,7 +733,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			if ((dst = rt->rt_gateway) == 0)
 				goto tx_error_icmp;
 		}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		else if (skb->protocol == htons(ETH_P_IPV6)) {
 			struct in6_addr *addr6;
 			int addr_type;
@@ -731,31 +765,39 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		tos = 0;
 		if (skb->protocol == htons(ETH_P_IP))
 			tos = old_iph->tos;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
 	}
 
 	{
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = dst,
-						.saddr = tiph->saddr,
-						.tos = RT_TOS(tos) } },
-				    .proto = IPPROTO_GRE };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = dst,
+					.saddr = tiph->saddr,
+					.tos = RT_TOS(tos)
+				}
+			},
+			.proto = IPPROTO_GRE
+		}
+;
 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			stats->tx_carrier_errors++;
+			dev->stats.tx_carrier_errors++;
 			goto tx_error;
 		}
 	}
-	tdev = rt->u.dst.dev;
+	tdev = rt->dst.dev;
 
 	if (tdev == dev) {
 		ip_rt_put(rt);
-		stats->collisions++;
+		dev->stats.collisions++;
 		goto tx_error;
 	}
 
 	df = tiph->frag_off;
 	if (df)
-		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
+		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
 	else
 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 
@@ -772,7 +814,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			goto tx_error;
 		}
 	}
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	else if (skb->protocol == htons(ETH_P_IPV6)) {
 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
 
@@ -803,7 +845,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			tunnel->err_count = 0;
 	}
 
-	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
+	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
 
 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
@@ -812,7 +854,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			dev->needed_headroom = max_headroom;
 		if (!new_skb) {
 			ip_rt_put(rt);
-			txq->tx_dropped++;
+			dev->stats.tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -830,7 +872,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -848,12 +890,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	if ((iph->ttl = tiph->ttl) == 0) {
 		if (skb->protocol == htons(ETH_P_IP))
 			iph->ttl = old_iph->ttl;
-#ifdef CONFIG_IPV6
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		else if (skb->protocol == htons(ETH_P_IPV6))
 			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
 #endif
 		else
-			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+			iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
 	}
 
 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -879,15 +921,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	}
 
 	nf_reset(skb);
-
-	IPTUNNEL_XMIT();
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
 	dst_link_failure(skb);
 
 tx_error:
-	stats->tx_errors++;
+	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
@@ -907,15 +949,21 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
 	/* Guess output device to choose reasonable mtu and needed_headroom */
 
 	if (iph->daddr) {
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = iph->daddr,
-						.saddr = iph->saddr,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_GRE };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_GRE
+		};
 		struct rtable *rt;
+
 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			tdev = rt->u.dst.dev;
+			tdev = rt->dst.dev;
 			ip_rt_put(rt);
 		}
 
@@ -1010,7 +1058,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 					break;
 				}
 			} else {
-				unsigned nflags = 0;
+				unsigned int nflags = 0;
 
 				t = netdev_priv(dev);
 
@@ -1024,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 					break;
 				}
 				ipgre_tunnel_unlink(ign, t);
+				synchronize_net();
 				t->parms.iph.saddr = p.iph.saddr;
 				t->parms.iph.daddr = p.iph.daddr;
 				t->parms.i_key = p.i_key;
@@ -1123,7 +1172,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 
 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 			unsigned short type,
-			const void *daddr, const void *saddr, unsigned len)
+			const void *daddr, const void *saddr, unsigned int len)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1165,16 +1214,22 @@ static int ipgre_open(struct net_device *dev)
 	struct ip_tunnel *t = netdev_priv(dev);
 
 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
-		struct flowi fl = { .oif = t->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = t->parms.iph.daddr,
-						.saddr = t->parms.iph.saddr,
-						.tos = RT_TOS(t->parms.iph.tos) } },
-				    .proto = IPPROTO_GRE };
+		struct flowi fl = {
+			.oif = t->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = t->parms.iph.daddr,
+					.saddr = t->parms.iph.saddr,
+					.tos = RT_TOS(t->parms.iph.tos)
+				}
+			},
+			.proto = IPPROTO_GRE
+		};
 		struct rtable *rt;
+
 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
 			return -EADDRNOTAVAIL;
-		dev = rt->u.dst.dev;
+		dev = rt->dst.dev;
 		ip_rt_put(rt);
 		if (__in_dev_get_rtnl(dev) == NULL)
 			return -EADDRNOTAVAIL;
@@ -1191,10 +1246,8 @@ static int ipgre_close(struct net_device *dev)
 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 		struct in_device *in_dev;
 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
-		if (in_dev) {
+		if (in_dev)
 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
-			in_dev_put(in_dev);
-		}
 	}
 	return 0;
 }
@@ -1211,12 +1264,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
 	.ndo_start_xmit		= ipgre_tunnel_xmit,
 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+	.ndo_get_stats		= ipgre_get_stats,
 };
 
+static void ipgre_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipgre_netdev_ops;
-	dev->destructor 	= free_netdev;
+	dev->destructor 	= ipgre_dev_free;
 
 	dev->type		= ARPHRD_IPGRE;
 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1254,6 +1314,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	} else
 		dev->header_ops = &ipgre_header_ops;
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -1261,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
-	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
 
 	tunnel->dev = dev;
 	strcpy(tunnel->parms.name, dev->name);
@@ -1272,14 +1335,12 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 	tunnel->hlen		= sizeof(struct iphdr) + 4;
 
 	dev_hold(dev);
-	ign->tunnels_wc[0]	= tunnel;
 }
 
 
-static const struct net_protocol ipgre_protocol = {
-	.handler	=	ipgre_rcv,
-	.err_handler	=	ipgre_err,
-	.netns_ok	=	1,
+static const struct gre_protocol ipgre_protocol = {
+	.handler     = ipgre_rcv,
+	.err_handler = ipgre_err,
 };
 
 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1289,11 +1350,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
 	for (prio = 0; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t = ign->tunnels[prio][h];
+			struct ip_tunnel *t;
+
+			t = rtnl_dereference(ign->tunnels[prio][h]);
 
 			while (t != NULL) {
 				unregister_netdevice_queue(t->dev, head);
-				t = t->next;
+				t = rtnl_dereference(t->next);
 			}
 		}
 	}
@@ -1318,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net)
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
 		goto err_reg_dev;
 
+	rcu_assign_pointer(ign->tunnels_wc[0],
+			   netdev_priv(ign->fb_tunnel_dev));
 	return 0;
 
 err_reg_dev:
-	free_netdev(ign->fb_tunnel_dev);
+	ipgre_dev_free(ign->fb_tunnel_dev);
 err_alloc_dev:
 	return err;
 }
@@ -1439,6 +1504,10 @@ static int ipgre_tap_init(struct net_device *dev)
 
 	ipgre_tunnel_bind_dev(dev);
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -1449,6 +1518,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+	.ndo_get_stats		= ipgre_get_stats,
 };
 
 static void ipgre_tap_setup(struct net_device *dev)
@@ -1457,7 +1527,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
-	dev->destructor 	= free_netdev;
+	dev->destructor 	= ipgre_dev_free;
 
 	dev->iflink		= 0;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
@@ -1485,6 +1555,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
 	if (!tb[IFLA_MTU])
 		dev->mtu = mtu;
 
+	/* Can use a lockless transmit, unless we generate output sequences */
+	if (!(nt->parms.o_flags & GRE_SEQ))
+		dev->features |= NETIF_F_LLTX;
+
 	err = register_netdevice(dev);
 	if (err)
 		goto out;
@@ -1520,7 +1594,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 		t = nt;
 
 		if (dev->type != ARPHRD_ETHER) {
-			unsigned nflags = 0;
+			unsigned int nflags = 0;
 
 			if (ipv4_is_multicast(p.iph.daddr))
 				nflags = IFF_BROADCAST;
@@ -1661,7 +1735,7 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		return err;
 
-	err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
+	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	if (err < 0) {
 		printk(KERN_INFO "ipgre init: can't add protocol\n");
 		goto add_proto_failed;
@@ -1681,7 +1755,7 @@ out:
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
-	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 add_proto_failed:
 	unregister_pernet_device(&ipgre_net_ops);
 	goto out;
@@ -1691,7 +1765,7 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
-	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
 	unregister_pernet_device(&ipgre_net_ops);
 }
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d930dc5e4d8..d859bcc26cb 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -146,7 +146,7 @@
 #include <linux/netlink.h>
 
 /*
- *	Process Router Attention IP option
+ *	Process Router Attention IP option (RFC 2113)
  */
 int ip_call_ra_chain(struct sk_buff *skb)
 {
@@ -155,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
 	struct sock *last = NULL;
 	struct net_device *dev = skb->dev;
 
-	read_lock(&ip_ra_lock);
-	for (ra = ip_ra_chain; ra; ra = ra->next) {
+	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
 		struct sock *sk = ra->sk;
 
 		/* If socket is bound to an interface, only report
@@ -167,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
 		     sk->sk_bound_dev_if == dev->ifindex) &&
 		    net_eq(sock_net(sk), dev_net(dev))) {
 			if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
-					read_unlock(&ip_ra_lock);
+				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
 					return 1;
-				}
 			}
 			if (last) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -183,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
 
 	if (last) {
 		raw_rcv(last, skb);
-		read_unlock(&ip_ra_lock);
 		return 1;
 	}
-	read_unlock(&ip_ra_lock);
 	return 0;
 }
 
@@ -298,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb)
 	}
 
 	if (unlikely(opt->srr)) {
-		struct in_device *in_dev = in_dev_get(dev);
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+
 		if (in_dev) {
 			if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
 				if (IN_DEV_LOG_MARTIANS(in_dev) &&
 				    net_ratelimit())
 					printk(KERN_INFO "source route option %pI4 -> %pI4\n",
 					       &iph->saddr, &iph->daddr);
-				in_dev_put(in_dev);
 				goto drop;
 			}
-
-			in_dev_put(in_dev);
 		}
 
 		if (ip_options_rcv_srr(skb))
@@ -340,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb)
 			else if (err == -ENETUNREACH)
 				IP_INC_STATS_BH(dev_net(skb->dev),
 						IPSTATS_MIB_INNOROUTES);
+			else if (err == -EXDEV)
+				NET_INC_STATS_BH(dev_net(skb->dev),
+						 LINUX_MIB_IPRPFILTER);
 			goto drop;
 		}
 	}
 
 #ifdef CONFIG_NET_CLS_ROUTE
 	if (unlikely(skb_dst(skb)->tclassid)) {
-		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
+		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
 		u32 idx = skb_dst(skb)->tclassid;
 		st[idx&0xFF].o_packets++;
 		st[idx&0xFF].o_bytes += skb->len;
@@ -360,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 	if (rt->rt_type == RTN_MULTICAST) {
-		IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
+		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
 				skb->len);
 	} else if (rt->rt_type == RTN_BROADCAST)
-		IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
+		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
 				skb->len);
 
 	return dst_input(skb);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488e..1906fa35860 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
 	}
 	return -EINVAL;
 }
-
+EXPORT_SYMBOL(ip_options_compile);
 
 /*
  *	Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(ip_options_rcv_srr);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 041d41df122..439d2a34ee4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -89,6 +89,7 @@ __inline__ void ip_send_check(struct iphdr *iph)
 	iph->check = 0;
 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 }
+EXPORT_SYMBOL(ip_send_check);
 
 int __ip_local_out(struct sk_buff *skb)
 {
@@ -151,15 +152,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	iph->version  = 4;
 	iph->ihl      = 5;
 	iph->tos      = inet->tos;
-	if (ip_dont_fragment(sk, &rt->u.dst))
+	if (ip_dont_fragment(sk, &rt->dst))
 		iph->frag_off = htons(IP_DF);
 	else
 		iph->frag_off = 0;
-	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
+	iph->ttl      = ip_select_ttl(inet, &rt->dst);
 	iph->daddr    = rt->rt_dst;
 	iph->saddr    = rt->rt_src;
 	iph->protocol = sk->sk_protocol;
-	ip_select_ident(iph, &rt->u.dst, sk);
+	ip_select_ident(iph, &rt->dst, sk);
 
 	if (opt && opt->optlen) {
 		iph->ihl += opt->optlen>>2;
@@ -172,7 +173,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 	/* Send it out. */
 	return ip_local_out(skb);
 }
-
 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 
 static inline int ip_finish_output2(struct sk_buff *skb)
@@ -240,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	struct rtable *rt = skb_rtable(skb);
-	struct net_device *dev = rt->u.dst.dev;
+	struct net_device *dev = rt->dst.dev;
 
 	/*
 	 *	If the indicated interface is up and running, send the packet.
@@ -359,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb)
 			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
 				goto no_route;
 		}
-		sk_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->dst);
 	}
-	skb_dst_set_noref(skb, &rt->u.dst);
+	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -372,11 +372,11 @@ packet_routed:
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
-	if (ip_dont_fragment(sk, &rt->u.dst) && !skb->local_df)
+	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
 		iph->frag_off = htons(IP_DF);
 	else
 		iph->frag_off = 0;
-	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
+	iph->ttl      = ip_select_ttl(inet, &rt->dst);
 	iph->protocol = sk->sk_protocol;
 	iph->saddr    = rt->rt_src;
 	iph->daddr    = rt->rt_dst;
@@ -387,7 +387,7 @@ packet_routed:
 		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
 	}
 
-	ip_select_ident_more(iph, &rt->u.dst, sk,
+	ip_select_ident_more(iph, &rt->dst, sk,
 			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 
 	skb->priority = sk->sk_priority;
@@ -403,6 +403,7 @@ no_route:
 	kfree_skb(skb);
 	return -EHOSTUNREACH;
 }
+EXPORT_SYMBOL(ip_queue_xmit);
 
 
 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
@@ -411,7 +412,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->priority = from->priority;
 	to->protocol = from->protocol;
 	skb_dst_drop(to);
-	skb_dst_set(to, dst_clone(skb_dst(from)));
+	skb_dst_copy(to, from);
 	to->dev = from->dev;
 	to->mark = from->mark;
 
@@ -442,17 +443,16 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
 	struct iphdr *iph;
-	int raw = 0;
 	int ptr;
 	struct net_device *dev;
 	struct sk_buff *skb2;
-	unsigned int mtu, hlen, left, len, ll_rs, pad;
+	unsigned int mtu, hlen, left, len, ll_rs;
 	int offset;
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
 	int err = 0;
 
-	dev = rt->u.dst.dev;
+	dev = rt->dst.dev;
 
 	/*
 	 *	Point into the IP datagram header.
@@ -473,7 +473,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	 */
 
 	hlen = iph->ihl * 4;
-	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
 #ifdef CONFIG_BRIDGE_NETFILTER
 	if (skb->nf_bridge)
 		mtu -= nf_bridge_mtu_reduction(skb);
@@ -487,10 +487,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	 * LATER: this step can be merged to real generation of fragments,
 	 * we can switch to copy when see the first bad fragment.
 	 */
-	if (skb_has_frags(skb)) {
-		struct sk_buff *frag;
+	if (skb_has_frag_list(skb)) {
+		struct sk_buff *frag, *frag2;
 		int first_len = skb_pagelen(skb);
-		int truesizes = 0;
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
@@ -503,18 +502,18 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
 			    skb_headroom(frag) < hlen)
-			    goto slow_path;
+				goto slow_path_clean;
 
 			/* Partially cloned skb? */
 			if (skb_shared(frag))
-				goto slow_path;
+				goto slow_path_clean;
 
 			BUG_ON(frag->sk);
 			if (skb->sk) {
 				frag->sk = skb->sk;
 				frag->destructor = sock_wfree;
 			}
-			truesizes += frag->truesize;
+			skb->truesize -= frag->truesize;
 		}
 
 		/* Everything is OK. Generate! */
@@ -524,7 +523,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		frag = skb_shinfo(skb)->frag_list;
 		skb_frag_list_init(skb);
 		skb->data_len = first_len - skb_headlen(skb);
-		skb->truesize -= truesizes;
 		skb->len = first_len;
 		iph->tot_len = htons(first_len);
 		iph->frag_off = htons(IP_MF);
@@ -576,18 +574,25 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		}
 		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 		return err;
+
+slow_path_clean:
+		skb_walk_frags(skb, frag2) {
+			if (frag2 == frag)
+				break;
+			frag2->sk = NULL;
+			frag2->destructor = NULL;
+			skb->truesize += frag2->truesize;
+		}
 	}
 
 slow_path:
 	left = skb->len - hlen;		/* Space per frame */
-	ptr = raw + hlen;		/* Where to start from */
+	ptr = hlen;		/* Where to start from */
 
 	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
 	 * we need to make room for the encapsulating header
 	 */
-	pad = nf_bridge_pad(skb);
-	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
-	mtu -= pad;
+	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
 
 	/*
 	 *	Fragment the datagram.
@@ -697,7 +702,6 @@ fail:
 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 	return err;
 }
-
 EXPORT_SYMBOL(ip_fragment);
 
 int
@@ -716,6 +720,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
 	}
 	return 0;
 }
+EXPORT_SYMBOL(ip_generic_getfrag);
 
 static inline __wsum
 csum_page(struct page *page, int offset, int copy)
@@ -833,16 +838,15 @@ int ip_append_data(struct sock *sk,
 		 */
 		*rtp = NULL;
 		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
-					    rt->u.dst.dev->mtu :
-					    dst_mtu(rt->u.dst.path);
-		inet->cork.dst = &rt->u.dst;
+					    rt->dst.dev->mtu :
+					    dst_mtu(rt->dst.path);
+		inet->cork.dst = &rt->dst;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
 		sk->sk_sndmsg_off = 0;
-		if ((exthdrlen = rt->u.dst.header_len) != 0) {
-			length += exthdrlen;
-			transhdrlen += exthdrlen;
-		}
+		exthdrlen = rt->dst.header_len;
+		length += exthdrlen;
+		transhdrlen += exthdrlen;
 	} else {
 		rt = (struct rtable *)inet->cork.dst;
 		if (inet->cork.flags & IPCORK_OPT)
@@ -852,7 +856,7 @@ int ip_append_data(struct sock *sk,
 		exthdrlen = 0;
 		mtu = inet->cork.fragsize;
 	}
-	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -869,7 +873,7 @@ int ip_append_data(struct sock *sk,
 	 */
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
-	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
+	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
@@ -878,7 +882,7 @@ int ip_append_data(struct sock *sk,
 	inet->cork.length += length;
 	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
+	    (rt->dst.dev->features & NETIF_F_UFO)) {
 		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
 					 fragheaderlen, transhdrlen, mtu,
 					 flags);
@@ -926,19 +930,22 @@ alloc_new_skb:
 			fraglen = datalen + fragheaderlen;
 
 			if ((flags & MSG_MORE) &&
-			    !(rt->u.dst.dev->features&NETIF_F_SG))
+			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
 			else
-				alloclen = datalen + fragheaderlen;
+				alloclen = fraglen;
 
 			/* The last fragment gets additional space at tail.
 			 * Note, with MSG_MORE we overallocate on fragments,
 			 * because we have no idea what fragment will be
 			 * the last.
 			 */
-			if (datalen == length + fraggap)
-				alloclen += rt->u.dst.trailer_len;
-
+			if (datalen == length + fraggap) {
+				alloclen += rt->dst.trailer_len;
+				/* make sure mtu is not reached */
+				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
+					datalen -= ALIGN(rt->dst.trailer_len, 8);
+			}
 			if (transhdrlen) {
 				skb = sock_alloc_send_skb(sk,
 						alloclen + hh_len + 15,
@@ -955,7 +962,7 @@ alloc_new_skb:
 				else
 					/* only the initial fragment is
 					   time stamped */
-					ipc->shtx.flags = 0;
+					ipc->tx_flags = 0;
 			}
 			if (skb == NULL)
 				goto error;
@@ -966,7 +973,7 @@ alloc_new_skb:
 			skb->ip_summed = csummode;
 			skb->csum = 0;
 			skb_reserve(skb, hh_len);
-			*skb_tx(skb) = ipc->shtx;
+			skb_shinfo(skb)->tx_flags = ipc->tx_flags;
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1010,7 +1017,7 @@ alloc_new_skb:
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG)) {
 			unsigned int off;
 
 			off = skb->len;
@@ -1105,10 +1112,10 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 	if (inet->cork.flags & IPCORK_OPT)
 		opt = inet->cork.opt;
 
-	if (!(rt->u.dst.dev->features&NETIF_F_SG))
+	if (!(rt->dst.dev->features&NETIF_F_SG))
 		return -EOPNOTSUPP;
 
-	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 	mtu = inet->cork.fragsize;
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
@@ -1125,7 +1132,7 @@ ssize_t	ip_append_page(struct sock *sk, struct page *page,
 	inet->cork.length += size;
 	if ((size + skb->len > mtu) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
+	    (rt->dst.dev->features & NETIF_F_UFO)) {
 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 	}
@@ -1277,8 +1284,8 @@ int ip_push_pending_frames(struct sock *sk)
 	 * If local_df is set too, we still allow to fragment this frame
 	 * locally. */
 	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
-	    (skb->len <= dst_mtu(&rt->u.dst) &&
-	     ip_dont_fragment(sk, &rt->u.dst)))
+	    (skb->len <= dst_mtu(&rt->dst) &&
+	     ip_dont_fragment(sk, &rt->dst)))
 		df = htons(IP_DF);
 
 	if (inet->cork.flags & IPCORK_OPT)
@@ -1287,7 +1294,7 @@ int ip_push_pending_frames(struct sock *sk)
 	if (rt->rt_type == RTN_MULTICAST)
 		ttl = inet->mc_ttl;
 	else
-		ttl = ip_select_ttl(inet, &rt->u.dst);
+		ttl = ip_select_ttl(inet, &rt->dst);
 
 	iph = (struct iphdr *)skb->data;
 	iph->version = 4;
@@ -1298,7 +1305,7 @@ int ip_push_pending_frames(struct sock *sk)
 	}
 	iph->tos = inet->tos;
 	iph->frag_off = df;
-	ip_select_ident(iph, &rt->u.dst, sk);
+	ip_select_ident(iph, &rt->dst, sk);
 	iph->ttl = ttl;
 	iph->protocol = sk->sk_protocol;
 	iph->saddr = rt->rt_src;
@@ -1311,7 +1318,7 @@ int ip_push_pending_frames(struct sock *sk)
 	 * on dst refcount
 	 */
 	inet->cork.dst = NULL;
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	if (iph->protocol == IPPROTO_ICMP)
 		icmp_out_count(net, ((struct icmphdr *)
@@ -1386,7 +1393,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 
 	if (replyopts.opt.optlen) {
 		ipc.opt = &replyopts.opt;
@@ -1448,7 +1455,3 @@ void __init ip_init(void)
 	igmp_mc_proc_init();
 #endif
 }
-
-EXPORT_SYMBOL(ip_generic_getfrag);
-EXPORT_SYMBOL(ip_queue_xmit);
-EXPORT_SYMBOL(ip_send_check);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ce231780a2b..3948c86e59c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -238,48 +238,68 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
    but receiver should be enough clever f.e. to forward mtrace requests,
    sent to multicast group to reach destination designated router.
  */
-struct ip_ra_chain *ip_ra_chain;
-DEFINE_RWLOCK(ip_ra_lock);
+struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
+
+
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+	struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
+
+	sock_put(ra->saved_sk);
+	kfree(ra);
+}
 
 int ip_ra_control(struct sock *sk, unsigned char on,
 		  void (*destructor)(struct sock *))
 {
-	struct ip_ra_chain *ra, *new_ra, **rap;
+	struct ip_ra_chain *ra, *new_ra;
+	struct ip_ra_chain __rcu **rap;
 
 	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
 		return -EINVAL;
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
-	write_lock_bh(&ip_ra_lock);
-	for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
+	spin_lock_bh(&ip_ra_lock);
+	for (rap = &ip_ra_chain;
+	     (ra = rcu_dereference_protected(*rap,
+			lockdep_is_held(&ip_ra_lock))) != NULL;
+	     rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
-				write_unlock_bh(&ip_ra_lock);
+				spin_unlock_bh(&ip_ra_lock);
 				kfree(new_ra);
 				return -EADDRINUSE;
 			}
-			*rap = ra->next;
-			write_unlock_bh(&ip_ra_lock);
+			/* dont let ip_call_ra_chain() use sk again */
+			ra->sk = NULL;
+			rcu_assign_pointer(*rap, ra->next);
+			spin_unlock_bh(&ip_ra_lock);
 
 			if (ra->destructor)
 				ra->destructor(sk);
-			sock_put(sk);
-			kfree(ra);
+			/*
+			 * Delay sock_put(sk) and kfree(ra) after one rcu grace
+			 * period. This guarantee ip_call_ra_chain() dont need
+			 * to mess with socket refcounts.
+			 */
+			ra->saved_sk = sk;
+			call_rcu(&ra->rcu, ip_ra_destroy_rcu);
 			return 0;
 		}
 	}
 	if (new_ra == NULL) {
-		write_unlock_bh(&ip_ra_lock);
+		spin_unlock_bh(&ip_ra_lock);
 		return -ENOBUFS;
 	}
 	new_ra->sk = sk;
 	new_ra->destructor = destructor;
 
 	new_ra->next = ra;
-	*rap = new_ra;
+	rcu_assign_pointer(*rap, new_ra);
 	sock_hold(sk);
-	write_unlock_bh(&ip_ra_lock);
+	spin_unlock_bh(&ip_ra_lock);
 
 	return 0;
 }
@@ -449,7 +469,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
 			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
-			     (1<<IP_MINTTL))) ||
+			     (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
 	    optname == IP_MULTICAST_TTL ||
 	    optname == IP_MULTICAST_ALL ||
 	    optname == IP_MULTICAST_LOOP ||
@@ -572,6 +592,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		}
 		inet->hdrincl = val ? 1 : 0;
 		break;
+	case IP_NODEFRAG:
+		if (sk->sk_type != SOCK_RAW) {
+			err = -ENOPROTOOPT;
+			break;
+		}
+		inet->nodefrag = val ? 1 : 0;
+		break;
 	case IP_MTU_DISCOVER:
 		if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
 			goto e_inval;
@@ -1106,6 +1133,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_HDRINCL:
 		val = inet->hdrincl;
 		break;
+	case IP_NODEFRAG:
+		val = inet->nodefrag;
+		break;
 	case IP_MTU_DISCOVER:
 		val = inet->pmtudisc;
 		break;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b9d84e800cf..3a6e1ec5e9a 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -665,6 +665,13 @@ ic_dhcp_init_options(u8 *options)
 		memcpy(e, ic_req_params, sizeof(ic_req_params));
 		e += sizeof(ic_req_params);
 
+		if (ic_host_name_set) {
+			*e++ = 12;	/* host-name */
+			len = strlen(utsname()->nodename);
+			*e++ = len;
+			memcpy(e, utsname()->nodename, len);
+			e += len;
+		}
 		if (*vendor_class_identifier) {
 			printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
 			       vendor_class_identifier);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7fd63671103..cd300aaee78 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,31 +122,59 @@
 
 static int ipip_net_id __read_mostly;
 struct ipip_net {
-	struct ip_tunnel *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel *tunnels_r[HASH_SIZE];
-	struct ip_tunnel *tunnels_l[HASH_SIZE];
-	struct ip_tunnel *tunnels_wc[1];
-	struct ip_tunnel **tunnels[4];
+	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_wc[1];
+	struct ip_tunnel __rcu **tunnels[4];
 
 	struct net_device *fb_tunnel_dev;
 };
 
-static void ipip_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_setup(struct net_device *dev);
+static void ipip_dev_free(struct net_device *dev);
 
 /*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
  */
-static DEFINE_SPINLOCK(ipip_lock);
 
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+{
+	struct pcpu_tstats sum = { 0 };
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+		sum.rx_packets += tstats->rx_packets;
+		sum.rx_bytes   += tstats->rx_bytes;
+		sum.tx_packets += tstats->tx_packets;
+		sum.tx_bytes   += tstats->tx_bytes;
+	}
+	dev->stats.rx_packets = sum.rx_packets;
+	dev->stats.rx_bytes   = sum.rx_bytes;
+	dev->stats.tx_packets = sum.tx_packets;
+	dev->stats.tx_bytes   = sum.tx_bytes;
+	return &dev->stats;
+}
+
 static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 		__be32 remote, __be32 local)
 {
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(local);
 	struct ip_tunnel *t;
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
@@ -169,12 +197,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 	return NULL;
 }
 
-static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
+static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
 		struct ip_tunnel_parm *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
-	unsigned h = 0;
+	unsigned int h = 0;
 	int prio = 0;
 
 	if (remote) {
@@ -188,7 +216,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
 	return &ipn->tunnels[prio][h];
 }
 
-static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
+static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
 		struct ip_tunnel *t)
 {
 	return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +224,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
 
 static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp;
-
-	for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
-		if (t == *tp) {
-			spin_lock_bh(&ipip_lock);
-			*tp = t->next;
-			spin_unlock_bh(&ipip_lock);
+	struct ip_tunnel __rcu **tp;
+	struct ip_tunnel *iter;
+
+	for (tp = ipip_bucket(ipn, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
 			break;
 		}
 	}
@@ -210,12 +239,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
 
 static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp = ipip_bucket(ipn, t);
+	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
 
-	spin_lock_bh(&ipip_lock);
-	t->next = *tp;
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
-	spin_unlock_bh(&ipip_lock);
 }
 
 static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +250,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
-	struct ip_tunnel *t, **tp, *nt;
+	struct ip_tunnel *t, *nt;
+	struct ip_tunnel __rcu **tp;
 	struct net_device *dev;
 	char name[IFNAMSIZ];
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
-	for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
+	for (tp = __ipip_bucket(ipn, parms);
+		 (t = rtnl_dereference(*tp)) != NULL;
+		 tp = &t->next) {
 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
 			return t;
 	}
@@ -238,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	if (parms->name[0])
 		strlcpy(name, parms->name, IFNAMSIZ);
 	else
-		sprintf(name, "tunl%%d");
+		strcpy(name, "tunl%d");
 
 	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
 	if (dev == NULL)
@@ -254,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	nt = netdev_priv(dev);
 	nt->parms = *parms;
 
-	ipip_tunnel_init(dev);
+	if (ipip_tunnel_init(dev) < 0)
+		goto failed_free;
 
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
@@ -264,20 +295,19 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	return nt;
 
 failed_free:
-	free_netdev(dev);
+	ipip_dev_free(dev);
 	return NULL;
 }
 
+/* called with RTNL */
 static void ipip_tunnel_uninit(struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
-	if (dev == ipn->fb_tunnel_dev) {
-		spin_lock_bh(&ipip_lock);
-		ipn->tunnels_wc[0] = NULL;
-		spin_unlock_bh(&ipip_lock);
-	} else
+	if (dev == ipn->fb_tunnel_dev)
+		rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
+	else
 		ipip_tunnel_unlink(ipn, netdev_priv(dev));
 	dev_put(dev);
 }
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 
 	rcu_read_lock();
-	if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
-					iph->saddr, iph->daddr)) != NULL) {
+	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	if (tunnel != NULL) {
+		struct pcpu_tstats *tstats;
+
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 			rcu_read_unlock();
 			kfree_skb(skb);
@@ -374,10 +406,16 @@ static int ipip_rcv(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_IP);
 		skb->pkt_type = PACKET_HOST;
 
-		skb_tunnel_rx(skb, tunnel->dev);
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
 
 		ipip_ecn_decapsulate(iph, skb);
+
 		netif_rx(skb);
+
 		rcu_read_unlock();
 		return 0;
 	}
@@ -394,13 +432,12 @@ static int ipip_rcv(struct sk_buff *skb)
 static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+	struct pcpu_tstats *tstats;
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	u8     tos = tunnel->parms.iph.tos;
 	__be16 df = tiph->frag_off;
 	struct rtable *rt;     			/* Route to the other host */
-	struct net_device *tdev;			/* Device to other host */
+	struct net_device *tdev;		/* Device to other host */
 	struct iphdr  *old_iph = ip_hdr(skb);
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
@@ -410,13 +447,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (skb->protocol != htons(ETH_P_IP))
 		goto tx_error;
 
-	if (tos&1)
+	if (tos & 1)
 		tos = old_iph->tos;
 
 	if (!dst) {
 		/* NBMA tunnel */
 		if ((rt = skb_rtable(skb)) == NULL) {
-			stats->tx_fifo_errors++;
+			dev->stats.tx_fifo_errors++;
 			goto tx_error;
 		}
 		if ((dst = rt->rt_gateway) == 0)
@@ -424,32 +461,38 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	{
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = dst,
-						.saddr = tiph->saddr,
-						.tos = RT_TOS(tos) } },
-				    .proto = IPPROTO_IPIP };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = dst,
+					.saddr = tiph->saddr,
+					.tos = RT_TOS(tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
+
 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			stats->tx_carrier_errors++;
+			dev->stats.tx_carrier_errors++;
 			goto tx_error_icmp;
 		}
 	}
-	tdev = rt->u.dst.dev;
+	tdev = rt->dst.dev;
 
 	if (tdev == dev) {
 		ip_rt_put(rt);
-		stats->collisions++;
+		dev->stats.collisions++;
 		goto tx_error;
 	}
 
 	df |= old_iph->frag_off & htons(IP_DF);
 
 	if (df) {
-		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 
 		if (mtu < 68) {
-			stats->collisions++;
+			dev->stats.collisions++;
 			ip_rt_put(rt);
 			goto tx_error;
 		}
@@ -485,7 +528,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
 			ip_rt_put(rt);
-			txq->tx_dropped++;
+			dev->stats.tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -503,7 +546,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
 			      IPSKB_REROUTED);
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -522,14 +565,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		iph->ttl	=	old_iph->ttl;
 
 	nf_reset(skb);
-
-	IPTUNNEL_XMIT();
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
 	dst_link_failure(skb);
 tx_error:
-	stats->tx_errors++;
+	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
@@ -544,15 +587,21 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
 	iph = &tunnel->parms.iph;
 
 	if (iph->daddr) {
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = iph->daddr,
-						.saddr = iph->saddr,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_IPIP };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
 		struct rtable *rt;
+
 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			tdev = rt->u.dst.dev;
+			tdev = rt->dst.dev;
 			ip_rt_put(rt);
 		}
 		dev->flags |= IFF_POINTOPOINT;
@@ -627,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 				}
 				t = netdev_priv(dev);
 				ipip_tunnel_unlink(ipn, t);
+				synchronize_net();
 				t->parms.iph.saddr = p.iph.saddr;
 				t->parms.iph.daddr = p.iph.daddr;
 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -696,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_start_xmit	= ipip_tunnel_xmit,
 	.ndo_do_ioctl	= ipip_tunnel_ioctl,
 	.ndo_change_mtu	= ipip_tunnel_change_mtu,
-
+	.ndo_get_stats  = ipip_get_stats,
 };
 
+static void ipip_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
 static void ipip_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipip_netdev_ops;
-	dev->destructor		= free_netdev;
+	dev->destructor		= ipip_dev_free;
 
 	dev->type		= ARPHRD_TUNNEL;
 	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
@@ -711,10 +767,11 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 }
 
-static void ipip_tunnel_init(struct net_device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
@@ -725,9 +782,15 @@ static void ipip_tunnel_init(struct net_device *dev)
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
 	ipip_tunnel_bind_dev(dev);
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
 }
 
-static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
@@ -740,11 +803,16 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
 	iph->protocol		= IPPROTO_IPIP;
 	iph->ihl		= 5;
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
 	dev_hold(dev);
-	ipn->tunnels_wc[0]	= tunnel;
+	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+	return 0;
 }
 
-static struct xfrm_tunnel ipip_handler = {
+static struct xfrm_tunnel ipip_handler __read_mostly = {
 	.handler	=	ipip_rcv,
 	.err_handler	=	ipip_err,
 	.priority	=	1,
@@ -760,11 +828,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t = ipn->tunnels[prio][h];
+			struct ip_tunnel *t;
 
+			t = rtnl_dereference(ipn->tunnels[prio][h]);
 			while (t != NULL) {
 				unregister_netdevice_queue(t->dev, head);
-				t = t->next;
+				t = rtnl_dereference(t->next);
 			}
 		}
 	}
@@ -789,7 +858,9 @@ static int __net_init ipip_init_net(struct net *net)
 	}
 	dev_net_set(ipn->fb_tunnel_dev, net);
 
-	ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+	if (err)
+		goto err_reg_dev;
 
 	if ((err = register_netdev(ipn->fb_tunnel_dev)))
 		goto err_reg_dev;
@@ -797,7 +868,7 @@ static int __net_init ipip_init_net(struct net *net)
 	return 0;
 
 err_reg_dev:
-	free_netdev(ipn->fb_tunnel_dev);
+	ipip_dev_free(ipn->fb_tunnel_dev);
 err_alloc_dev:
 	/* nothing */
 	return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7f6273506ee..86dd5691af4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
 	struct net		*net;
 #endif
 	u32			id;
-	struct sock		*mroute_sk;
+	struct sock __rcu	*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc_unres_queue;
 	struct list_head	mfc_cache_array[MFC_LINES];
@@ -98,7 +98,7 @@ struct ipmr_result {
 };
 
 /* Big lock, protecting vif table, mrt cache and mroute socket state.
-   Note that the changes are semaphored via rtnl_lock.
+ * Note that the changes are semaphored via rtnl_lock.
  */
 
 static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
 static DEFINE_SPINLOCK(mfc_unres_lock);
 
 /* We return to original Alan's scheme. Hash table of resolved
-   entries is changed only in process context and protected
-   with weak lock mrt_lock. Queue of unresolved entries is protected
-   with strong spinlock mfc_unres_lock.
-
-   In this case data path is free of exclusive locks at all.
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
  */
 
 static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
 			set_fs(KERNEL_DS);
 			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
 			set_fs(oldfs);
-		} else
+		} else {
 			err = -EOPNOTSUPP;
-
+		}
 		dev = NULL;
 
 		if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 	dev->iflink = 0;
 
 	rcu_read_lock();
-	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
 		rcu_read_unlock();
 		goto failure;
 	}
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 		mrt->mroute_reg_vif_num = -1;
 #endif
 
-	if (vifi+1 == mrt->maxvif) {
+	if (vifi + 1 == mrt->maxvif) {
 		int tmp;
-		for (tmp=vifi-1; tmp>=0; tmp--) {
+
+		for (tmp = vifi - 1; tmp >= 0; tmp--) {
 			if (VIF_EXISTS(mrt, tmp))
 				break;
 		}
@@ -565,25 +567,33 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 
 	dev_set_allmulti(dev, -1);
 
-	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
 		ip_rt_multicast_event(in_dev);
 	}
 
-	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
 		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
 	return 0;
 }
 
-static inline void ipmr_cache_free(struct mfc_cache *c)
+static void ipmr_cache_free_rcu(struct rcu_head *head)
 {
+	struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
 	kmem_cache_free(mrt_cachep, c);
 }
 
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+	call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
 /* Destroy an unresolved cache entry, killing queued skbs
-   and reporting error to netlink readers.
+ * and reporting error to netlink readers.
  */
 
 static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -605,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 			memset(&e->msg, 0, sizeof(e->msg));
 
 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
-		} else
+		} else {
 			kfree_skb(skb);
+		}
 	}
 
 	ipmr_cache_free(c);
@@ -724,13 +735,13 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	case 0:
 		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
 			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
-			if (dev && dev->ip_ptr == NULL) {
+			if (dev && __in_dev_get_rtnl(dev) == NULL) {
 				dev_put(dev);
 				return -EADDRNOTAVAIL;
 			}
-		} else
+		} else {
 			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
-
+		}
 		if (!dev)
 			return -EADDRNOTAVAIL;
 		err = dev_set_allmulti(dev, 1);
@@ -743,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 		return -EINVAL;
 	}
 
-	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev) {
 		dev_put(dev);
 		return -EADDRNOTAVAIL;
 	}
 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
 	ip_rt_multicast_event(in_dev);
 
-	/*
-	 *	Fill in the VIF structures
-	 */
+	/* Fill in the VIF structures */
+
 	v->rate_limit = vifc->vifc_rate_limit;
 	v->local = vifc->vifc_lcl_addr.s_addr;
 	v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -765,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	v->pkt_in = 0;
 	v->pkt_out = 0;
 	v->link = dev->ifindex;
-	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
 		v->link = dev->iflink;
 
 	/* And finish update writing critical data */
 	write_lock_bh(&mrt_lock);
 	v->dev = dev;
 #ifdef CONFIG_IP_PIMSM
-	if (v->flags&VIFF_REGISTER)
+	if (v->flags & VIFF_REGISTER)
 		mrt->mroute_reg_vif_num = vifi;
 #endif
 	if (vifi+1 > mrt->maxvif)
@@ -781,6 +792,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	return 0;
 }
 
+/* called with rcu_read_lock() */
 static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 					 __be32 origin,
 					 __be32 mcastgrp)
@@ -788,7 +800,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 	int line = MFC_HASH(mcastgrp, origin);
 	struct mfc_cache *c;
 
-	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
 			return c;
 	}
@@ -801,19 +813,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_alloc(void)
 {
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
-	if (c == NULL)
-		return NULL;
-	c->mfc_un.res.minvif = MAXVIFS;
+
+	if (c)
+		c->mfc_un.res.minvif = MAXVIFS;
 	return c;
 }
 
 static struct mfc_cache *ipmr_cache_alloc_unres(void)
 {
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
-	if (c == NULL)
-		return NULL;
-	skb_queue_head_init(&c->mfc_un.unres.unresolved);
-	c->mfc_un.unres.expires = jiffies + 10*HZ;
+
+	if (c) {
+		skb_queue_head_init(&c->mfc_un.unres.unresolved);
+		c->mfc_un.unres.expires = jiffies + 10*HZ;
+	}
 	return c;
 }
 
@@ -827,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 	struct sk_buff *skb;
 	struct nlmsgerr *e;
 
-	/*
-	 *	Play the pending entries through our router
-	 */
+	/* Play the pending entries through our router */
 
 	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
 		if (ip_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 
 			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
-				nlh->nlmsg_len = (skb_tail_pointer(skb) -
-						  (u8 *)nlh);
+				nlh->nlmsg_len = skb_tail_pointer(skb) -
+						 (u8 *)nlh;
 			} else {
 				nlh->nlmsg_type = NLMSG_ERROR;
 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -848,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 			}
 
 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
-		} else
+		} else {
 			ip_mr_forward(net, mrt, skb, c, 0);
+		}
 	}
 }
 
@@ -867,6 +879,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	const int ihl = ip_hdrlen(pkt);
 	struct igmphdr *igmp;
 	struct igmpmsg *msg;
+	struct sock *mroute_sk;
 	int ret;
 
 #ifdef CONFIG_IP_PIMSM
@@ -882,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
 #ifdef CONFIG_IP_PIMSM
 	if (assert == IGMPMSG_WHOLEPKT) {
 		/* Ugly, but we have no choice with this interface.
-		   Duplicate old header, fix ihl, length etc.
-		   And all this only to mangle msg->im_msgtype and
-		   to set msg->im_mbz to "mbz" :-)
+		 * Duplicate old header, fix ihl, length etc.
+		 * And all this only to mangle msg->im_msgtype and
+		 * to set msg->im_mbz to "mbz" :-)
 		 */
 		skb_push(skb, sizeof(struct iphdr));
 		skb_reset_network_header(skb);
@@ -901,39 +914,38 @@ static int ipmr_cache_report(struct mr_table *mrt,
 #endif
 	{
 
-	/*
-	 *	Copy the IP header
-	 */
+	/* Copy the IP header */
 
 	skb->network_header = skb->tail;
 	skb_put(skb, ihl);
 	skb_copy_to_linear_data(skb, pkt->data, ihl);
-	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
+	ip_hdr(skb)->protocol = 0;	/* Flag to the kernel this is a route add */
 	msg = (struct igmpmsg *)skb_network_header(skb);
 	msg->im_vif = vifi;
 	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
 
-	/*
-	 *	Add our header
-	 */
+	/* Add our header */
 
-	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+	igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
 	igmp->type	=
 	msg->im_msgtype = assert;
-	igmp->code 	=	0;
-	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
+	igmp->code	= 0;
+	ip_hdr(skb)->tot_len = htons(skb->len);		/* Fix the length */
 	skb->transport_header = skb->network_header;
 	}
 
-	if (mrt->mroute_sk == NULL) {
+	rcu_read_lock();
+	mroute_sk = rcu_dereference(mrt->mroute_sk);
+	if (mroute_sk == NULL) {
+		rcu_read_unlock();
 		kfree_skb(skb);
 		return -EINVAL;
 	}
 
-	/*
-	 *	Deliver to mrouted
-	 */
-	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
+	/* Deliver to mrouted */
+
+	ret = sock_queue_rcv_skb(mroute_sk, skb);
+	rcu_read_unlock();
 	if (ret < 0) {
 		if (net_ratelimit())
 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -965,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 	}
 
 	if (!found) {
-		/*
-		 *	Create a new entry if allowable
-		 */
+		/* Create a new entry if allowable */
 
 		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
 		    (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -977,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 			return -ENOBUFS;
 		}
 
-		/*
-		 *	Fill in the new cache entry
-		 */
+		/* Fill in the new cache entry */
+
 		c->mfc_parent	= -1;
 		c->mfc_origin	= iph->saddr;
 		c->mfc_mcastgrp	= iph->daddr;
 
-		/*
-		 *	Reflect first query at mrouted.
-		 */
+		/* Reflect first query at mrouted. */
+
 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
 		if (err < 0) {
 			/* If the report failed throw the cache entry
@@ -1006,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
 	}
 
-	/*
-	 *	See if we can append the packet
-	 */
-	if (c->mfc_un.unres.unresolved.qlen>3) {
+	/* See if we can append the packet */
+
+	if (c->mfc_un.unres.unresolved.qlen > 3) {
 		kfree_skb(skb);
 		err = -ENOBUFS;
 	} else {
@@ -1035,9 +1042,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
 	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
-			write_lock_bh(&mrt_lock);
-			list_del(&c->list);
-			write_unlock_bh(&mrt_lock);
+			list_del_rcu(&c->list);
 
 			ipmr_cache_free(c);
 			return 0;
@@ -1090,9 +1095,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	write_lock_bh(&mrt_lock);
-	list_add(&c->list, &mrt->mfc_cache_array[line]);
-	write_unlock_bh(&mrt_lock);
+	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
 
 	/*
 	 *	Check to see if we resolved a queued list. If so we
@@ -1130,26 +1133,21 @@ static void mroute_clean_tables(struct mr_table *mrt)
 	LIST_HEAD(list);
 	struct mfc_cache *c, *next;
 
-	/*
-	 *	Shut down all active vif entries
-	 */
+	/* Shut down all active vif entries */
+
 	for (i = 0; i < mrt->maxvif; i++) {
-		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+		if (!(mrt->vif_table[i].flags & VIFF_STATIC))
 			vif_delete(mrt, i, 0, &list);
 	}
 	unregister_netdevice_many(&list);
 
-	/*
-	 *	Wipe the cache
-	 */
+	/* Wipe the cache */
+
 	for (i = 0; i < MFC_LINES; i++) {
 		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
-			if (c->mfc_flags&MFC_STATIC)
+			if (c->mfc_flags & MFC_STATIC)
 				continue;
-			write_lock_bh(&mrt_lock);
-			list_del(&c->list);
-			write_unlock_bh(&mrt_lock);
-
+			list_del_rcu(&c->list);
 			ipmr_cache_free(c);
 		}
 	}
@@ -1164,6 +1162,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
 	}
 }
 
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
 static void mrtsock_destruct(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
@@ -1171,13 +1172,9 @@ static void mrtsock_destruct(struct sock *sk)
 
 	rtnl_lock();
 	ipmr_for_each_table(mrt, net) {
-		if (sk == mrt->mroute_sk) {
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
-
-			write_lock_bh(&mrt_lock);
-			mrt->mroute_sk = NULL;
-			write_unlock_bh(&mrt_lock);
-
+			rcu_assign_pointer(mrt->mroute_sk, NULL);
 			mroute_clean_tables(mrt);
 		}
 	}
@@ -1204,7 +1201,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		return -ENOENT;
 
 	if (optname != MRT_INIT) {
-		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
+		if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
+		    !capable(CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
@@ -1217,23 +1215,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -ENOPROTOOPT;
 
 		rtnl_lock();
-		if (mrt->mroute_sk) {
+		if (rtnl_dereference(mrt->mroute_sk)) {
 			rtnl_unlock();
 			return -EADDRINUSE;
 		}
 
 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
 		if (ret == 0) {
-			write_lock_bh(&mrt_lock);
-			mrt->mroute_sk = sk;
-			write_unlock_bh(&mrt_lock);
-
+			rcu_assign_pointer(mrt->mroute_sk, sk);
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
 		}
 		rtnl_unlock();
 		return ret;
 	case MRT_DONE:
-		if (sk != mrt->mroute_sk)
+		if (sk != rcu_dereference_raw(mrt->mroute_sk))
 			return -EACCES;
 		return ip_ra_control(sk, 0, NULL);
 	case MRT_ADD_VIF:
@@ -1246,7 +1241,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -ENFILE;
 		rtnl_lock();
 		if (optname == MRT_ADD_VIF) {
-			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
+			ret = vif_add(net, mrt, &vif,
+				      sk == rtnl_dereference(mrt->mroute_sk));
 		} else {
 			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
 		}
@@ -1267,7 +1263,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		if (optname == MRT_DEL_MFC)
 			ret = ipmr_mfc_delete(mrt, &mfc);
 		else
-			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
+			ret = ipmr_mfc_add(net, mrt, &mfc,
+					   sk == rtnl_dereference(mrt->mroute_sk));
 		rtnl_unlock();
 		return ret;
 		/*
@@ -1276,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	case MRT_ASSERT:
 	{
 		int v;
-		if (get_user(v,(int __user *)optval))
+		if (get_user(v, (int __user *)optval))
 			return -EFAULT;
 		mrt->mroute_do_assert = (v) ? 1 : 0;
 		return 0;
@@ -1286,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	{
 		int v;
 
-		if (get_user(v,(int __user *)optval))
+		if (get_user(v, (int __user *)optval))
 			return -EFAULT;
 		v = (v) ? 1 : 0;
 
@@ -1309,14 +1306,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -EINVAL;
 		if (get_user(v, (u32 __user *)optval))
 			return -EFAULT;
-		if (sk == mrt->mroute_sk)
-			return -EBUSY;
 
 		rtnl_lock();
 		ret = 0;
-		if (!ipmr_new_table(net, v))
-			ret = -ENOMEM;
-		raw_sk(sk)->ipmr_table = v;
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
+			ret = -EBUSY;
+		} else {
+			if (!ipmr_new_table(net, v))
+				ret = -ENOMEM;
+			raw_sk(sk)->ipmr_table = v;
+		}
 		rtnl_unlock();
 		return ret;
 	}
@@ -1347,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
 
 	if (optname != MRT_VERSION &&
 #ifdef CONFIG_IP_PIMSM
-	   optname!=MRT_PIM &&
+	   optname != MRT_PIM &&
 #endif
-	   optname!=MRT_ASSERT)
+	   optname != MRT_ASSERT)
 		return -ENOPROTOOPT;
 
 	if (get_user(olr, optlen))
@@ -1416,19 +1415,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
 		if (copy_from_user(&sr, arg, sizeof(sr)))
 			return -EFAULT;
 
-		read_lock(&mrt_lock);
+		rcu_read_lock();
 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
 		if (c) {
 			sr.pktcnt = c->mfc_un.res.pkt;
 			sr.bytecnt = c->mfc_un.res.bytes;
 			sr.wrong_if = c->mfc_un.res.wrong_if;
-			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
 				return -EFAULT;
 			return 0;
 		}
-		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 		return -EADDRNOTAVAIL;
 	default:
 		return -ENOIOCTLCMD;
@@ -1465,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
 };
 
 /*
- * 	Encapsulate a packet by attaching a valid IPIP header to it.
+ *	Encapsulate a packet by attaching a valid IPIP header to it.
  *	This avoids tunnel drivers and other mess and gives us the speed so
  *	important for multicast video.
  */
@@ -1480,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 
-	iph->version	= 	4;
+	iph->version	=	4;
 	iph->tos	=	old_iph->tos;
 	iph->ttl	=	old_iph->ttl;
 	iph->frag_off	=	0;
@@ -1498,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 
 static inline int ipmr_forward_finish(struct sk_buff *skb)
 {
-	struct ip_options * opt	= &(IPCB(skb)->opt);
+	struct ip_options *opt = &(IPCB(skb)->opt);
 
 	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
 
@@ -1535,32 +1534,44 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 	}
 #endif
 
-	if (vif->flags&VIFF_TUNNEL) {
-		struct flowi fl = { .oif = vif->link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = vif->remote,
-						.saddr = vif->local,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_IPIP };
+	if (vif->flags & VIFF_TUNNEL) {
+		struct flowi fl = {
+			.oif = vif->link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = vif->remote,
+					.saddr = vif->local,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
+
 		if (ip_route_output_key(net, &rt, &fl))
 			goto out_free;
 		encap = sizeof(struct iphdr);
 	} else {
-		struct flowi fl = { .oif = vif->link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = iph->daddr,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_IPIP };
+		struct flowi fl = {
+			.oif = vif->link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
+
 		if (ip_route_output_key(net, &rt, &fl))
 			goto out_free;
 	}
 
-	dev = rt->u.dst.dev;
+	dev = rt->dst.dev;
 
-	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
 		/* Do not fragment multicasts. Alas, IPv4 does not
-		   allow to send ICMP, so that packets will disappear
-		   to blackhole.
+		 * allow to send ICMP, so that packets will disappear
+		 * to blackhole.
 		 */
 
 		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1568,7 +1579,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 		goto out_free;
 	}
 
-	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
+	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
 
 	if (skb_cow(skb, encap)) {
 		ip_rt_put(rt);
@@ -1579,11 +1590,12 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 	vif->bytes_out += skb->len;
 
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	ip_decrease_ttl(ip_hdr(skb));
 
 	/* FIXME: forward and output firewalls used to be called here.
-	 * What do we do with netfilter? -- RR */
+	 * What do we do with netfilter? -- RR
+	 */
 	if (vif->flags & VIFF_TUNNEL) {
 		ip_encap(skb, vif->local, vif->remote);
 		/* FIXME: extra output firewall step used to be here. --RR */
@@ -1644,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 
 		if (skb_rtable(skb)->fl.iif == 0) {
 			/* It is our own packet, looped back.
-			   Very complicated situation...
-
-			   The best workaround until routing daemons will be
-			   fixed is not to redistribute packet, if it was
-			   send through wrong interface. It means, that
-			   multicast applications WILL NOT work for
-			   (S,G), which have default multicast route pointing
-			   to wrong oif. In any case, it is not a good
-			   idea to use multicasting applications on router.
+			 * Very complicated situation...
+			 *
+			 * The best workaround until routing daemons will be
+			 * fixed is not to redistribute packet, if it was
+			 * send through wrong interface. It means, that
+			 * multicast applications WILL NOT work for
+			 * (S,G), which have default multicast route pointing
+			 * to wrong oif. In any case, it is not a good
+			 * idea to use multicasting applications on router.
 			 */
 			goto dont_forward;
 		}
@@ -1662,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 
 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
-		       so that we cannot check that packet arrived on an oif.
-		       It is bad, but otherwise we would need to move pretty
-		       large chunk of pimd to kernel. Ough... --ANK
+		     * so that we cannot check that packet arrived on an oif.
+		     * It is bad, but otherwise we would need to move pretty
+		     * large chunk of pimd to kernel. Ough... --ANK
 		     */
 		    (mrt->mroute_do_pim ||
 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1682,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 	/*
 	 *	Forward the frame
 	 */
-	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+	for (ct = cache->mfc_un.res.maxvif - 1;
+	     ct >= cache->mfc_un.res.minvif; ct--) {
 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
 			if (psend != -1) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
 				if (skb2)
 					ipmr_queue_xmit(net, mrt, skb2, cache,
 							psend);
@@ -1696,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 	if (psend != -1) {
 		if (local) {
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
 			if (skb2)
 				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
 		} else {
@@ -1713,6 +1728,7 @@ dont_forward:
 
 /*
  *	Multicast packets for forwarding arrive here
+ *	Called with rcu_read_lock();
  */
 
 int ip_mr_input(struct sk_buff *skb)
@@ -1724,9 +1740,9 @@ int ip_mr_input(struct sk_buff *skb)
 	int err;
 
 	/* Packet is looped back after forward, it should not be
-	   forwarded second time, but still can be delivered locally.
+	 * forwarded second time, but still can be delivered locally.
 	 */
-	if (IPCB(skb)->flags&IPSKB_FORWARDED)
+	if (IPCB(skb)->flags & IPSKB_FORWARDED)
 		goto dont_forward;
 
 	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,28 +1752,28 @@ int ip_mr_input(struct sk_buff *skb)
 	}
 
 	if (!local) {
-		    if (IPCB(skb)->opt.router_alert) {
-			    if (ip_call_ra_chain(skb))
-				    return 0;
-		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
-			    /* IGMPv1 (and broken IGMPv2 implementations sort of
-			       Cisco IOS <= 11.2(8)) do not put router alert
-			       option to IGMP packets destined to routable
-			       groups. It is very bad, because it means
-			       that we can forward NO IGMP messages.
-			     */
-			    read_lock(&mrt_lock);
-			    if (mrt->mroute_sk) {
-				    nf_reset(skb);
-				    raw_rcv(mrt->mroute_sk, skb);
-				    read_unlock(&mrt_lock);
-				    return 0;
-			    }
-			    read_unlock(&mrt_lock);
+		if (IPCB(skb)->opt.router_alert) {
+			if (ip_call_ra_chain(skb))
+				return 0;
+		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+			/* IGMPv1 (and broken IGMPv2 implementations sort of
+			 * Cisco IOS <= 11.2(8)) do not put router alert
+			 * option to IGMP packets destined to routable
+			 * groups. It is very bad, because it means
+			 * that we can forward NO IGMP messages.
+			 */
+			struct sock *mroute_sk;
+
+			mroute_sk = rcu_dereference(mrt->mroute_sk);
+			if (mroute_sk) {
+				nf_reset(skb);
+				raw_rcv(mroute_sk, skb);
+				return 0;
+			}
 		    }
 	}
 
-	read_lock(&mrt_lock);
+	/* already under rcu_read_lock() */
 	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
 
 	/*
@@ -1769,13 +1785,12 @@ int ip_mr_input(struct sk_buff *skb)
 		if (local) {
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 			ip_local_deliver(skb);
-			if (skb2 == NULL) {
-				read_unlock(&mrt_lock);
+			if (skb2 == NULL)
 				return -ENOBUFS;
-			}
 			skb = skb2;
 		}
 
+		read_lock(&mrt_lock);
 		vif = ipmr_find_vif(mrt, skb->dev);
 		if (vif >= 0) {
 			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1788,8 +1803,8 @@ int ip_mr_input(struct sk_buff *skb)
 		return -ENODEV;
 	}
 
+	read_lock(&mrt_lock);
 	ip_mr_forward(net, mrt, skb, cache, local);
-
 	read_unlock(&mrt_lock);
 
 	if (local)
@@ -1805,6 +1820,7 @@ dont_forward:
 }
 
 #ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 		     unsigned int pimlen)
 {
@@ -1813,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 
 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
 	/*
-	   Check that:
-	   a. packet is really destinted to a multicast group
-	   b. packet is not a NULL-REGISTER
-	   c. packet is not truncated
+	 * Check that:
+	 * a. packet is really sent to a multicast group
+	 * b. packet is not a NULL-REGISTER
+	 * c. packet is not truncated
 	 */
 	if (!ipv4_is_multicast(encap->daddr) ||
 	    encap->tot_len == 0 ||
@@ -1826,26 +1842,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 	read_lock(&mrt_lock);
 	if (mrt->mroute_reg_vif_num >= 0)
 		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
-	if (reg_dev)
-		dev_hold(reg_dev);
 	read_unlock(&mrt_lock);
 
 	if (reg_dev == NULL)
 		return 1;
 
 	skb->mac_header = skb->network_header;
-	skb_pull(skb, (u8*)encap - skb->data);
+	skb_pull(skb, (u8 *)encap - skb->data);
 	skb_reset_network_header(skb);
 	skb->protocol = htons(ETH_P_IP);
-	skb->ip_summed = 0;
+	skb->ip_summed = CHECKSUM_NONE;
 	skb->pkt_type = PACKET_HOST;
 
 	skb_tunnel_rx(skb, reg_dev);
 
 	netif_rx(skb);
-	dev_put(reg_dev);
 
-	return 0;
+	return NET_RX_SUCCESS;
 }
 #endif
 
@@ -1854,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
  * Handle IGMP messages of PIMv1
  */
 
-int pim_rcv_v1(struct sk_buff * skb)
+int pim_rcv_v1(struct sk_buff *skb)
 {
 	struct igmphdr *pim;
 	struct net *net = dev_net(skb->dev);
@@ -1881,7 +1894,7 @@ drop:
 #endif
 
 #ifdef CONFIG_IP_PIMSM_V2
-static int pim_rcv(struct sk_buff * skb)
+static int pim_rcv(struct sk_buff *skb)
 {
 	struct pimreghdr *pim;
 	struct net *net = dev_net(skb->dev);
@@ -1891,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
 		goto drop;
 
 	pim = (struct pimreghdr *)skb_transport_header(skb);
-	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
-	    (pim->flags&PIM_NULL_REGISTER) ||
+	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+	    (pim->flags & PIM_NULL_REGISTER) ||
 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
 		goto drop;
@@ -1958,28 +1971,33 @@ int ipmr_get_route(struct net *net,
 	if (mrt == NULL)
 		return -ENOENT;
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
 
 	if (cache == NULL) {
 		struct sk_buff *skb2;
 		struct iphdr *iph;
 		struct net_device *dev;
-		int vif;
+		int vif = -1;
 
 		if (nowait) {
-			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 			return -EAGAIN;
 		}
 
 		dev = skb->dev;
-		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
+		read_lock(&mrt_lock);
+		if (dev)
+			vif = ipmr_find_vif(mrt, dev);
+		if (vif < 0) {
 			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 			return -ENODEV;
 		}
 		skb2 = skb_clone(skb, GFP_ATOMIC);
 		if (!skb2) {
 			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 			return -ENOMEM;
 		}
 
@@ -1992,13 +2010,16 @@ int ipmr_get_route(struct net *net,
 		iph->version = 0;
 		err = ipmr_cache_unresolved(mrt, vif, skb2);
 		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 		return err;
 	}
 
-	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+	read_lock(&mrt_lock);
+	if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
 		cache->mfc_flags |= MFC_NOTIFY;
 	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
 	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -2050,14 +2071,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	s_h = cb->args[1];
 	s_e = cb->args[2];
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
 		if (t > s_t)
 			s_h = 0;
 		for (h = s_h; h < MFC_LINES; h++) {
-			list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
+			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
 				if (e < s_e)
 					goto next_entry;
 				if (ipmr_fill_mroute(mrt, skb,
@@ -2075,7 +2096,7 @@ next_table:
 		t++;
 	}
 done:
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 
 	cb->args[2] = e;
 	cb->args[1] = h;
@@ -2086,7 +2107,8 @@ done:
 
 #ifdef CONFIG_PROC_FS
 /*
- *	The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
+ *	The /proc interfaces to multicast routing :
+ *	/proc/net/ip_mr_cache & /proc/net/ip_mr_vif
  */
 struct ipmr_vif_iter {
 	struct seq_net_private p;
@@ -2208,14 +2230,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 	struct mr_table *mrt = it->mrt;
 	struct mfc_cache *mfc;
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
 		it->cache = &mrt->mfc_cache_array[it->ct];
-		list_for_each_entry(mfc, it->cache, list)
+		list_for_each_entry_rcu(mfc, it->cache, list)
 			if (pos-- == 0)
 				return mfc;
 	}
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
 	it->cache = &mrt->mfc_unres_queue;
@@ -2274,7 +2296,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	}
 
 	/* exhausted cache_array, show unresolved */
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 	it->cache = &mrt->mfc_unres_queue;
 	it->ct = 0;
 
@@ -2282,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (!list_empty(it->cache))
 		return list_first_entry(it->cache, struct mfc_cache, list);
 
- end_of_list:
+end_of_list:
 	spin_unlock_bh(&mfc_unres_lock);
 	it->cache = NULL;
 
@@ -2297,7 +2319,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
 	else if (it->cache == &mrt->mfc_cache_array[it->ct])
-		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 }
 
 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2323,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 				   mfc->mfc_un.res.bytes,
 				   mfc->mfc_un.res.wrong_if);
 			for (n = mfc->mfc_un.res.minvif;
-			     n < mfc->mfc_un.res.maxvif; n++ ) {
+			     n < mfc->mfc_un.res.maxvif; n++) {
 				if (VIF_EXISTS(mrt, n) &&
 				    mfc->mfc_un.res.ttls[n] < 255)
 					seq_printf(seq,
@@ -2421,7 +2443,7 @@ int __init ip_mr_init(void)
 
 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
 				       sizeof(struct mfc_cache),
-				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 				       NULL);
 	if (!mrt_cachep)
 		return -ENOMEM;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 07de855e217..d88a46c54fd 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -43,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 
 		/* Drop old route. */
 		skb_dst_drop(skb);
-		skb_dst_set(skb, &rt->u.dst);
+		skb_dst_set(skb, &rt->dst);
 	} else {
 		/* non-local src, find valid iif to satisfy
 		 * rp-filter when calling ip_route_input. */
@@ -53,11 +53,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 
 		orefdst = skb->_skb_refdst;
 		if (ip_route_input(skb, iph->daddr, iph->saddr,
-				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
-			dst_release(&rt->u.dst);
+				   RT_TOS(iph->tos), rt->dst.dev) != 0) {
+			dst_release(&rt->dst);
 			return -1;
 		}
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 		refdst_drop(orefdst);
 	}
 
@@ -212,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
 					       skb->len - dataoff, 0);
 		skb->ip_summed = CHECKSUM_NONE;
-		csum = __skb_checksum_complete_head(skb, dataoff + len);
-		if (!csum)
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		return __skb_checksum_complete_head(skb, dataoff + len);
 	}
 	return csum;
 }
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf980..babd1a2bae5 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -147,7 +147,7 @@ config IP_NF_TARGET_ULOG
 	  which can only be viewed through syslog.
 
 	  The appropriate userspace logging daemon (ulogd) may be obtained from
-	  <http://www.gnumonks.org/projects/ulogd/>
+	  <http://www.netfilter.org/projects/ulogd/index.html>
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
 
 config IP_NF_TARGET_TTL
 	tristate '"TTL" target support'
-	depends on NETFILTER_ADVANCED
+	depends on NETFILTER_ADVANCED && IP_NF_MANGLE
 	select NETFILTER_XT_TARGET_HL
 	---help---
-	This is a backwards-compat option for the user's convenience
+	This is a backwards-compatible option for the user's convenience
 	(e.g. when running oldconfig). It selects
 	CONFIG_NETFILTER_XT_TARGET_HL.
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 1ac01b12862..3cad2591ace 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
 	for (i = 0; i < len; i++)
 		ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
 
-	return (ret != 0);
+	return ret != 0;
 }
 
 /*
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
 	return NF_DROP;
 }
 
-static inline const struct arpt_entry_target *
+static inline const struct xt_entry_target *
 arpt_get_target_c(const struct arpt_entry *e)
 {
 	return arpt_get_target((struct arpt_entry *)e);
@@ -282,17 +282,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 	arp = arp_hdr(skb);
 	do {
-		const struct arpt_entry_target *t;
-		int hdr_len;
+		const struct xt_entry_target *t;
 
 		if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
 			e = arpt_next_entry(e);
 			continue;
 		}
 
-		hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
-			(2 * skb->dev->addr_len);
-		ADD_COUNTER(e->counters, hdr_len, 1);
+		ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
 
 		t = arpt_get_target_c(e);
 
@@ -300,10 +297,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 		if (!t->u.kernel.target->target) {
 			int v;
 
-			v = ((struct arpt_standard_target *)t)->verdict;
+			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
-				if (v != ARPT_RETURN) {
+				if (v != XT_RETURN) {
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
@@ -335,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 		/* Target might have changed stuff. */
 		arp = arp_hdr(skb);
 
-		if (verdict == ARPT_CONTINUE)
+		if (verdict == XT_CONTINUE)
 			e = arpt_next_entry(e);
 		else
 			/* Verdict */
@@ -380,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			const struct arpt_standard_target *t
+			const struct xt_standard_target *t
 				= (void *)arpt_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
@@ -395,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 			/* Unconditional return/END. */
 			if ((e->target_offset == sizeof(struct arpt_entry) &&
 			     (strcmp(t->target.u.user.name,
-				     ARPT_STANDARD_TARGET) == 0) &&
+				     XT_STANDARD_TARGET) == 0) &&
 			     t->verdict < 0 && unconditional(&e->arp)) ||
 			    visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
-					    ARPT_STANDARD_TARGET) == 0) &&
+					    XT_STANDARD_TARGET) == 0) &&
 				    t->verdict < -NF_MAX_VERDICT - 1) {
 					duprintf("mark_source_chains: bad "
 						"negative verdict (%i)\n",
@@ -436,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   ARPT_STANDARD_TARGET) == 0 &&
+					   XT_STANDARD_TARGET) == 0 &&
 				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct arpt_entry)) {
@@ -467,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 
 static inline int check_entry(const struct arpt_entry *e, const char *name)
 {
-	const struct arpt_entry_target *t;
+	const struct xt_entry_target *t;
 
 	if (!arp_checkentry(&e->arp)) {
 		duprintf("arp_tables: arp check failed %p %s.\n", e, name);
 		return -EINVAL;
 	}
 
-	if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
+	if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
 		return -EINVAL;
 
 	t = arpt_get_target_c(e);
@@ -486,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
 
 static inline int check_target(struct arpt_entry *e, const char *name)
 {
-	struct arpt_entry_target *t = arpt_get_target(e);
+	struct xt_entry_target *t = arpt_get_target(e);
 	int ret;
 	struct xt_tgchk_param par = {
 		.table     = name,
@@ -509,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 static inline int
 find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 
@@ -539,7 +536,7 @@ out:
 
 static bool check_underflow(const struct arpt_entry *e)
 {
-	const struct arpt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->arp))
@@ -547,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
 	t = arpt_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
-	verdict = ((struct arpt_standard_target *)t)->verdict;
+	verdict = ((struct xt_standard_target *)t)->verdict;
 	verdict = -verdict - 1;
 	return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -569,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	}
 
 	if (e->next_offset
-	    < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+	    < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
 		duprintf("checking: element %p size %u\n",
 			 e, e->next_offset);
 		return -EINVAL;
@@ -601,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 static inline void cleanup_entry(struct arpt_entry *e)
 {
 	struct xt_tgdtor_param par;
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 
 	t = arpt_get_target(e);
 	par.target   = t->u.kernel.target;
@@ -713,7 +710,7 @@ static void get_counters(const struct xt_table_info *t,
 	struct arpt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -723,19 +720,22 @@ static void get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
 	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
 		SET_COUNTER(counters[i], iter->counters.bcnt,
 			    iter->counters.pcnt);
 		++i;
 	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		local_bh_disable();
 		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
 			ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -743,8 +743,9 @@ static void get_counters(const struct xt_table_info *t,
 			++i;
 		}
 		xt_info_wrunlock(cpu);
+		local_bh_enable();
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -758,7 +759,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	 * about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -793,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
-		const struct arpt_entry_target *t;
+		const struct xt_entry_target *t;
 
 		e = (struct arpt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
@@ -806,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
 
 		t = arpt_get_target_c(e);
 		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct arpt_entry_target,
+				 + offsetof(struct xt_entry_target,
 					    u.user.name),
 				 t->u.kernel.target->name,
 				 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -843,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
 			     const struct xt_table_info *info,
 			     const void *base, struct xt_table_info *newinfo)
 {
-	const struct arpt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
@@ -894,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
 	int ret;
 
@@ -907,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
 	if (copy_from_user(name, user, sizeof(name)) != 0)
 		return -EFAULT;
 
-	name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_lock(NFPROTO_ARP);
@@ -1005,8 +1006,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
-				numa_node_id());
+	counters = vmalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
@@ -1159,7 +1159,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 
@@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 #ifdef CONFIG_COMPAT
 static inline void compat_release_entry(struct compat_arpt_entry *e)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 
 	t = compat_arpt_get_target(e);
 	module_put(t->u.kernel.target->me);
@@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 				  const unsigned int *underflows,
 				  const char *name)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	int ret, off, h;
@@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 			    unsigned int *size, const char *name,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	struct arpt_entry *de;
 	unsigned int origsize;
@@ -1420,6 +1420,9 @@ static int translate_compat_table(const char *name,
 		if (ret != 0)
 			break;
 		++i;
+		if (strcmp(arpt_get_target(iter1)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
 	}
 	if (ret) {
 		/*
@@ -1471,7 +1474,7 @@ out_unlock:
 }
 
 struct compat_arpt_replace {
-	char				name[ARPT_TABLE_MAXNAMELEN];
+	char				name[XT_TABLE_MAXNAMELEN];
 	u32				valid_hooks;
 	u32				num_entries;
 	u32				size;
@@ -1564,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
 				     struct xt_counters *counters,
 				     unsigned int i)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct compat_arpt_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
@@ -1625,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 }
 
 struct compat_arpt_get_entries {
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t size;
 	struct compat_arpt_entry entrytable[0];
 };
@@ -1825,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
 /* The built-in targets: standard (NULL) and error. */
 static struct xt_target arpt_builtin_tg[] __read_mostly = {
 	{
-		.name             = ARPT_STANDARD_TARGET,
+		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_ARP,
 #ifdef CONFIG_COMPAT
@@ -1835,9 +1838,9 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
 #endif
 	},
 	{
-		.name             = ARPT_ERROR_TARGET,
+		.name             = XT_ERROR_TARGET,
 		.target           = arpt_error,
-		.targetsize       = ARPT_FUNCTION_MAXNAMELEN,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_ARP,
 	},
 };
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171..b8ddcc480ed 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
 		return false;
 
 	if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
-	   mangle->target != ARPT_CONTINUE)
+	   mangle->target != XT_CONTINUE)
 		return false;
 	return true;
 }
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index a4e5fc5df4b..d2c1311cb28 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -42,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
 
 static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
 static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_RWLOCK(queue_lock);
+static DEFINE_SPINLOCK(queue_lock);
 static int peer_pid __read_mostly;
 static unsigned int copy_range __read_mostly;
 static unsigned int queue_total;
@@ -72,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
 		break;
 
 	case IPQ_COPY_PACKET:
-		copy_mode = mode;
+		if (range > 0xFFFF)
+			range = 0xFFFF;
 		copy_range = range;
-		if (copy_range > 0xFFFF)
-			copy_range = 0xFFFF;
+		copy_mode = mode;
 		break;
 
 	default:
@@ -101,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id)
 {
 	struct nf_queue_entry *entry = NULL, *i;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	list_for_each_entry(i, &queue_list, list) {
 		if ((unsigned long)i == id) {
@@ -115,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id)
 		queue_total--;
 	}
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return entry;
 }
 
@@ -136,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
 static void
 ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
 {
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 	__ipq_flush(cmpfn, data);
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 }
 
 static struct sk_buff *
@@ -152,9 +152,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 	struct nlmsghdr *nlh;
 	struct timeval tv;
 
-	read_lock_bh(&queue_lock);
-
-	switch (copy_mode) {
+	switch (ACCESS_ONCE(copy_mode)) {
 	case IPQ_COPY_META:
 	case IPQ_COPY_NONE:
 		size = NLMSG_SPACE(sizeof(*pmsg));
@@ -162,26 +160,21 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 
 	case IPQ_COPY_PACKET:
 		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
-		    (*errp = skb_checksum_help(entry->skb))) {
-			read_unlock_bh(&queue_lock);
+		    (*errp = skb_checksum_help(entry->skb)))
 			return NULL;
-		}
-		if (copy_range == 0 || copy_range > entry->skb->len)
+
+		data_len = ACCESS_ONCE(copy_range);
+		if (data_len == 0 || data_len > entry->skb->len)
 			data_len = entry->skb->len;
-		else
-			data_len = copy_range;
 
 		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
 		break;
 
 	default:
 		*errp = -EINVAL;
-		read_unlock_bh(&queue_lock);
 		return NULL;
 	}
 
-	read_unlock_bh(&queue_lock);
-
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb)
 		goto nlmsg_failure;
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	if (nskb == NULL)
 		return status;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	if (!peer_pid)
 		goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 
 	__ipq_enqueue_entry(entry);
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 
 err_out_free_nskb:
 	kfree_skb(nskb);
 
 err_out_unlock:
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 }
 
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
 {
 	int status;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 	status = __ipq_set_mode(mode, range);
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 }
 
@@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (security_netlink_recv(skb, CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	if (peer_pid) {
 		if (peer_pid != pid) {
-			write_unlock_bh(&queue_lock);
+			spin_unlock_bh(&queue_lock);
 			RCV_SKB_FAIL(-EBUSY);
 		}
 	} else {
@@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
 		peer_pid = pid;
 	}
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 
 	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
 				  nlmsglen - NLMSG_LENGTH(0));
@@ -497,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this,
 	struct netlink_notify *n = ptr;
 
 	if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
-		write_lock_bh(&queue_lock);
+		spin_lock_bh(&queue_lock);
 		if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
 			__ipq_reset();
-		write_unlock_bh(&queue_lock);
+		spin_unlock_bh(&queue_lock);
 	}
 	return NOTIFY_DONE;
 }
@@ -527,7 +520,7 @@ static ctl_table ipq_table[] = {
 #ifdef CONFIG_PROC_FS
 static int ip_queue_show(struct seq_file *m, void *v)
 {
-	read_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	seq_printf(m,
 		      "Peer PID          : %d\n"
@@ -545,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v)
 		      queue_dropped,
 		      queue_user_dropped);
 
-	read_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4b6c5ca610f..d31b007a6d8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
 }
 
 /* for const-correctness */
-static inline const struct ipt_entry_target *
+static inline const struct xt_entry_target *
 ipt_get_target_c(const struct ipt_entry *e)
 {
 	return ipt_get_target((struct ipt_entry *)e);
@@ -230,9 +230,9 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 		      const char *hookname, const char **chainname,
 		      const char **comment, unsigned int *rulenum)
 {
-	const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
+	const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
 
-	if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
+	if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
 		*chainname = t->target.data;
 		(*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 
 		if (s->target_offset == sizeof(struct ipt_entry) &&
 		    strcmp(t->target.u.kernel.target->name,
-			   IPT_STANDARD_TARGET) == 0 &&
+			   XT_STANDARD_TARGET) == 0 &&
 		   t->verdict < 0 &&
 		   unconditional(&s->ip)) {
 			/* Tail of chains: STANDARD target (return/policy) */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
 		 get_entry(table_base, private->underflow[hook]));
 
 	do {
-		const struct ipt_entry_target *t;
+		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
@@ -364,7 +364,7 @@ ipt_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+		ADD_COUNTER(e->counters, skb->len, 1);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -380,10 +380,10 @@ ipt_do_table(struct sk_buff *skb,
 		if (!t->u.kernel.target->target) {
 			int v;
 
-			v = ((struct ipt_standard_target *)t)->verdict;
+			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
-				if (v != IPT_RETURN) {
+				if (v != XT_RETURN) {
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
 		verdict = t->u.kernel.target->target(skb, &acpar);
 		/* Target might have changed stuff. */
 		ip = ip_hdr(skb);
-		if (verdict == IPT_CONTINUE)
+		if (verdict == XT_CONTINUE)
 			e = ipt_next_entry(e);
 		else
 			/* Verdict */
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			const struct ipt_standard_target *t
+			const struct xt_standard_target *t
 				= (void *)ipt_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			/* Unconditional return/END. */
 			if ((e->target_offset == sizeof(struct ipt_entry) &&
 			     (strcmp(t->target.u.user.name,
-				     IPT_STANDARD_TARGET) == 0) &&
+				     XT_STANDARD_TARGET) == 0) &&
 			     t->verdict < 0 && unconditional(&e->ip)) ||
 			    visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
-			    		    IPT_STANDARD_TARGET) == 0) &&
+			    		    XT_STANDARD_TARGET) == 0) &&
 				    t->verdict < -NF_MAX_VERDICT - 1) {
 					duprintf("mark_source_chains: bad "
 						"negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   IPT_STANDARD_TARGET) == 0 &&
+					   XT_STANDARD_TARGET) == 0 &&
 				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct ipt_entry)) {
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	return 1;
 }
 
-static void cleanup_match(struct ipt_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
 {
 	struct xt_mtdtor_param par;
 
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ipt_entry *e, const char *name)
 {
-	const struct ipt_entry_target *t;
+	const struct xt_entry_target *t;
 
 	if (!ip_checkentry(&e->ip)) {
 		duprintf("ip check failed %p %s.\n", e, par->match->name);
 		return -EINVAL;
 	}
 
-	if (e->target_offset + sizeof(struct ipt_entry_target) >
+	if (e->target_offset + sizeof(struct xt_entry_target) >
 	    e->next_offset)
 		return -EINVAL;
 
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
 }
 
 static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ipt_ip *ip = par->entryinfo;
 	int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
 }
 
 static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	struct xt_match *match;
 	int ret;
@@ -630,7 +630,7 @@ err:
 
 static int check_target(struct ipt_entry *e, struct net *net, const char *name)
 {
-	struct ipt_entry_target *t = ipt_get_target(e);
+	struct xt_entry_target *t = ipt_get_target(e);
 	struct xt_tgchk_param par = {
 		.net       = net,
 		.table     = name,
@@ -656,7 +656,7 @@ static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 		 unsigned int size)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 
 static bool check_underflow(const struct ipt_entry *e)
 {
-	const struct ipt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
 	t = ipt_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
-	verdict = ((struct ipt_standard_target *)t)->verdict;
+	verdict = ((struct xt_standard_target *)t)->verdict;
 	verdict = -verdict - 1;
 	return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	}
 
 	if (e->next_offset
-	    < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+	    < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
 		duprintf("checking: element %p size %u\n",
 			 e, e->next_offset);
 		return -EINVAL;
@@ -771,7 +771,7 @@ static void
 cleanup_entry(struct ipt_entry *e, struct net *net)
 {
 	struct xt_tgdtor_param par;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -884,7 +884,7 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -894,19 +894,22 @@ get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
 	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
 		SET_COUNTER(counters[i], iter->counters.bcnt,
 			    iter->counters.pcnt);
 		++i;
 	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		local_bh_disable();
 		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
 			ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -914,8 +917,9 @@ get_counters(const struct xt_table_info *t,
 			++i; /* macro does multi eval of i */
 		}
 		xt_info_wrunlock(cpu);
+		local_bh_enable();
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -928,7 +932,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -968,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
 	/* ... then go back and fix counters and names */
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
 		unsigned int i;
-		const struct ipt_entry_match *m;
-		const struct ipt_entry_target *t;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
 
 		e = (struct ipt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
@@ -986,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
 			m = (void *)e + i;
 
 			if (copy_to_user(userptr + off + i
-					 + offsetof(struct ipt_entry_match,
+					 + offsetof(struct xt_entry_match,
 						    u.user.name),
 					 m->u.kernel.match->name,
 					 strlen(m->u.kernel.match->name)+1)
@@ -998,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
 
 		t = ipt_get_target_c(e);
 		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct ipt_entry_target,
+				 + offsetof(struct xt_entry_target,
 					    u.user.name),
 				 t->u.kernel.target->name,
 				 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1036,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
 			     const void *base, struct xt_table_info *newinfo)
 {
 	const struct xt_entry_match *ematch;
-	const struct ipt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
@@ -1088,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
 	int ret;
 
@@ -1101,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
 	if (copy_from_user(name, user, sizeof(name)) != 0)
 		return -EFAULT;
 
-	name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_lock(AF_INET);
@@ -1352,7 +1356,7 @@ do_add_counters(struct net *net, const void __user *user,
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 
@@ -1396,14 +1400,14 @@ do_add_counters(struct net *net, const void __user *user,
 
 #ifdef CONFIG_COMPAT
 struct compat_ipt_replace {
-	char			name[IPT_TABLE_MAXNAMELEN];
+	char			name[XT_TABLE_MAXNAMELEN];
 	u32			valid_hooks;
 	u32			num_entries;
 	u32			size;
 	u32			hook_entry[NF_INET_NUMHOOKS];
 	u32			underflow[NF_INET_NUMHOOKS];
 	u32			num_counters;
-	compat_uptr_t		counters;	/* struct ipt_counters * */
+	compat_uptr_t		counters;	/* struct xt_counters * */
 	struct compat_ipt_entry	entries[0];
 };
 
@@ -1412,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 			  unsigned int *size, struct xt_counters *counters,
 			  unsigned int i)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct compat_ipt_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
@@ -1447,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 }
 
 static int
-compat_find_calc_match(struct ipt_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
 		       const char *name,
 		       const struct ipt_ip *ip,
 		       unsigned int hookmask,
@@ -1469,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
 
 static void compat_release_entry(struct compat_ipt_entry *e)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -1490,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 				  const char *name)
 {
 	struct xt_entry_match *ematch;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	unsigned int j;
@@ -1572,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 			    unsigned int *size, const char *name,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	struct ipt_entry *de;
 	unsigned int origsize;
@@ -1747,6 +1751,9 @@ translate_compat_table(struct net *net,
 		if (ret != 0)
 			break;
 		++i;
+		if (strcmp(ipt_get_target(iter1)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
 	}
 	if (ret) {
 		/*
@@ -1877,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 }
 
 struct compat_ipt_get_entries {
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t size;
 	struct compat_ipt_entry entrytable[0];
 };
@@ -2032,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
 	case IPT_SO_GET_REVISION_MATCH:
 	case IPT_SO_GET_REVISION_TARGET: {
-		struct ipt_get_revision rev;
+		struct xt_get_revision rev;
 		int target;
 
 		if (*len != sizeof(rev)) {
@@ -2169,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
 
 static struct xt_target ipt_builtin_tg[] __read_mostly = {
 	{
-		.name             = IPT_STANDARD_TARGET,
+		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_IPV4,
 #ifdef CONFIG_COMPAT
@@ -2179,9 +2186,9 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
 #endif
 	},
 	{
-		.name             = IPT_ERROR_TARGET,
+		.name             = XT_ERROR_TARGET,
 		.target           = ipt_error,
-		.targetsize       = IPT_FUNCTION_MAXNAMELEN,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_IPV4,
 	},
 };
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index f91c94b9a79..1e26a489765 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/net_namespace.h>
 #include <net/checksum.h>
+#include <net/ip.h>
 
 #define CLUSTERIP_VERSION "0.8"
 
@@ -53,12 +54,13 @@ struct clusterip_config {
 #endif
 	enum clusterip_hashmode hash_mode;	/* which hashing mode */
 	u_int32_t hash_initval;			/* hash initialization */
+	struct rcu_head rcu;
 };
 
 static LIST_HEAD(clusterip_configs);
 
 /* clusterip_lock protects the clusterip_configs list */
-static DEFINE_RWLOCK(clusterip_lock);
+static DEFINE_SPINLOCK(clusterip_lock);
 
 #ifdef CONFIG_PROC_FS
 static const struct file_operations clusterip_proc_fops;
@@ -71,11 +73,17 @@ clusterip_config_get(struct clusterip_config *c)
 	atomic_inc(&c->refcount);
 }
 
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+	kfree(container_of(head, struct clusterip_config, rcu));
+}
+
 static inline void
 clusterip_config_put(struct clusterip_config *c)
 {
 	if (atomic_dec_and_test(&c->refcount))
-		kfree(c);
+		call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
 }
 
 /* decrease the count of entries using/referencing this config.  If last
@@ -84,10 +92,11 @@ clusterip_config_put(struct clusterip_config *c)
 static inline void
 clusterip_config_entry_put(struct clusterip_config *c)
 {
-	write_lock_bh(&clusterip_lock);
-	if (atomic_dec_and_test(&c->entries)) {
-		list_del(&c->list);
-		write_unlock_bh(&clusterip_lock);
+	local_bh_disable();
+	if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+		list_del_rcu(&c->list);
+		spin_unlock(&clusterip_lock);
+		local_bh_enable();
 
 		dev_mc_del(c->dev, c->clustermac);
 		dev_put(c->dev);
@@ -100,7 +109,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
 #endif
 		return;
 	}
-	write_unlock_bh(&clusterip_lock);
+	local_bh_enable();
 }
 
 static struct clusterip_config *
@@ -108,7 +117,7 @@ __clusterip_config_find(__be32 clusterip)
 {
 	struct clusterip_config *c;
 
-	list_for_each_entry(c, &clusterip_configs, list) {
+	list_for_each_entry_rcu(c, &clusterip_configs, list) {
 		if (c->clusterip == clusterip)
 			return c;
 	}
@@ -121,16 +130,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
 {
 	struct clusterip_config *c;
 
-	read_lock_bh(&clusterip_lock);
+	rcu_read_lock_bh();
 	c = __clusterip_config_find(clusterip);
-	if (!c) {
-		read_unlock_bh(&clusterip_lock);
-		return NULL;
+	if (c) {
+		if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+			c = NULL;
+		else if (entry)
+			atomic_inc(&c->entries);
 	}
-	atomic_inc(&c->refcount);
-	if (entry)
-		atomic_inc(&c->entries);
-	read_unlock_bh(&clusterip_lock);
+	rcu_read_unlock_bh();
 
 	return c;
 }
@@ -181,9 +189,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 	}
 #endif
 
-	write_lock_bh(&clusterip_lock);
-	list_add(&c->list, &clusterip_configs);
-	write_unlock_bh(&clusterip_lock);
+	spin_lock_bh(&clusterip_lock);
+	list_add_rcu(&c->list, &clusterip_configs);
+	spin_unlock_bh(&clusterip_lock);
 
 	return c;
 }
@@ -224,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	unsigned long hashval;
-	u_int16_t sport, dport;
-	const u_int16_t *ports;
-
-	switch (iph->protocol) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-	case IPPROTO_SCTP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ICMP:
-		ports = (const void *)iph+iph->ihl*4;
-		sport = ports[0];
-		dport = ports[1];
-		break;
-	default:
+	u_int16_t sport = 0, dport = 0;
+	int poff;
+
+	poff = proto_ports_offset(iph->protocol);
+	if (poff >= 0) {
+		const u_int16_t *ports;
+		u16 _ports[2];
+
+		ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+		if (ports) {
+			sport = ports[0];
+			dport = ports[1];
+		}
+	} else {
 		if (net_ratelimit())
 			pr_info("unknown protocol %u\n", iph->protocol);
-		sport = dport = 0;
 	}
 
 	switch (config->hash_mode) {
@@ -462,7 +468,7 @@ struct arp_payload {
 	__be32 src_ip;
 	u_int8_t dst_hw[ETH_ALEN];
 	__be32 dst_ip;
-} __attribute__ ((packed));
+} __packed;
 
 #ifdef DEBUG
 static void arp_print(struct arp_payload *payload)
@@ -733,6 +739,9 @@ static void __exit clusterip_tg_exit(void)
 #endif
 	nf_unregister_hook(&cip_arp_ops);
 	xt_unregister_target(&clusterip_tg_reg);
+
+	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
+	rcu_barrier_bh();
 }
 
 module_init(clusterip_tg_init);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 5234f4f3499..72ffc8fda2e 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/skbuff.h>
+#include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <net/icmp.h>
 #include <net/udp.h>
@@ -23,16 +24,15 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ipt_LOG.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
 
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
 /* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+			const struct nf_loginfo *info,
 			const struct sk_buff *skb,
 			unsigned int iphoff)
 {
@@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
-		printk("TRUNCATED");
+		sb_add(m, "TRUNCATED");
 		return;
 	}
 
 	/* Important fields:
 	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
 	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
-	printk("SRC=%pI4 DST=%pI4 ",
+	sb_add(m, "SRC=%pI4 DST=%pI4 ",
 	       &ih->saddr, &ih->daddr);
 
 	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
-	printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+	sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
 	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
 	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
 
 	/* Max length: 6 "CE DF MF " */
 	if (ntohs(ih->frag_off) & IP_CE)
-		printk("CE ");
+		sb_add(m, "CE ");
 	if (ntohs(ih->frag_off) & IP_DF)
-		printk("DF ");
+		sb_add(m, "DF ");
 	if (ntohs(ih->frag_off) & IP_MF)
-		printk("MF ");
+		sb_add(m, "MF ");
 
 	/* Max length: 11 "FRAG:65535 " */
 	if (ntohs(ih->frag_off) & IP_OFFSET)
-		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+		sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
 	if ((logflags & IPT_LOG_IPOPT) &&
 	    ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
 		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
 					optsize, _opt);
 		if (op == NULL) {
-			printk("TRUNCATED");
+			sb_add(m, "TRUNCATED");
 			return;
 		}
 
 		/* Max length: 127 "OPT (" 15*4*2chars ") " */
-		printk("OPT (");
+		sb_add(m, "OPT (");
 		for (i = 0; i < optsize; i++)
-			printk("%02X", op[i]);
-		printk(") ");
+			sb_add(m, "%02X", op[i]);
+		sb_add(m, ") ");
 	}
 
 	switch (ih->protocol) {
@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct tcphdr *th;
 
 		/* Max length: 10 "PROTO=TCP " */
-		printk("PROTO=TCP ");
+		sb_add(m, "PROTO=TCP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
 		th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
 					sizeof(_tcph), &_tcph);
 		if (th == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u ",
+		sb_add(m, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
 		if (logflags & IPT_LOG_TCPSEQ)
-			printk("SEQ=%u ACK=%u ",
+			sb_add(m, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
-		printk("WINDOW=%u ", ntohs(th->window));
+		sb_add(m, "WINDOW=%u ", ntohs(th->window));
 		/* Max length: 9 "RES=0x3F " */
-		printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
 		if (th->cwr)
-			printk("CWR ");
+			sb_add(m, "CWR ");
 		if (th->ece)
-			printk("ECE ");
+			sb_add(m, "ECE ");
 		if (th->urg)
-			printk("URG ");
+			sb_add(m, "URG ");
 		if (th->ack)
-			printk("ACK ");
+			sb_add(m, "ACK ");
 		if (th->psh)
-			printk("PSH ");
+			sb_add(m, "PSH ");
 		if (th->rst)
-			printk("RST ");
+			sb_add(m, "RST ");
 		if (th->syn)
-			printk("SYN ");
+			sb_add(m, "SYN ");
 		if (th->fin)
-			printk("FIN ");
+			sb_add(m, "FIN ");
 		/* Max length: 11 "URGP=65535 " */
-		printk("URGP=%u ", ntohs(th->urg_ptr));
+		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
 		if ((logflags & IPT_LOG_TCPOPT) &&
 		    th->doff * 4 > sizeof(struct tcphdr)) {
@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
 						iphoff+ih->ihl*4+sizeof(_tcph),
 						optsize, _opt);
 			if (op == NULL) {
-				printk("TRUNCATED");
+				sb_add(m, "TRUNCATED");
 				return;
 			}
 
 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
-			printk("OPT (");
+			sb_add(m, "OPT (");
 			for (i = 0; i < optsize; i++)
-				printk("%02X", op[i]);
-			printk(") ");
+				sb_add(m, "%02X", op[i]);
+			sb_add(m, ") ");
 		}
 		break;
 	}
@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		if (ih->protocol == IPPROTO_UDP)
 			/* Max length: 10 "PROTO=UDP "     */
-			printk("PROTO=UDP " );
+			sb_add(m, "PROTO=UDP " );
 		else	/* Max length: 14 "PROTO=UDPLITE " */
-			printk("PROTO=UDPLITE ");
+			sb_add(m, "PROTO=UDPLITE ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
 		uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_udph), &_udph);
 		if (uh == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u LEN=%u ",
+		sb_add(m, "SPT=%u DPT=%u LEN=%u ",
 		       ntohs(uh->source), ntohs(uh->dest),
 		       ntohs(uh->len));
 		break;
@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
 			    [ICMP_ADDRESSREPLY] = 12 };
 
 		/* Max length: 11 "PROTO=ICMP " */
-		printk("PROTO=ICMP ");
+		sb_add(m, "PROTO=ICMP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
 		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
 					 sizeof(_icmph), &_icmph);
 		if (ich == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 18 "TYPE=255 CODE=255 " */
-		printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+		sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		if (ich->type <= NR_ICMP_TYPES &&
 		    required_len[ich->type] &&
 		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
@@ -250,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
 		case ICMP_ECHOREPLY:
 		case ICMP_ECHO:
 			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			printk("ID=%u SEQ=%u ",
+			sb_add(m, "ID=%u SEQ=%u ",
 			       ntohs(ich->un.echo.id),
 			       ntohs(ich->un.echo.sequence));
 			break;
 
 		case ICMP_PARAMETERPROB:
 			/* Max length: 14 "PARAMETER=255 " */
-			printk("PARAMETER=%u ",
+			sb_add(m, "PARAMETER=%u ",
 			       ntohl(ich->un.gateway) >> 24);
 			break;
 		case ICMP_REDIRECT:
 			/* Max length: 24 "GATEWAY=255.255.255.255 " */
-			printk("GATEWAY=%pI4 ", &ich->un.gateway);
+			sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
 			/* Fall through */
 		case ICMP_DEST_UNREACH:
 		case ICMP_SOURCE_QUENCH:
 		case ICMP_TIME_EXCEEDED:
 			/* Max length: 3+maxlen */
 			if (!iphoff) { /* Only recurse once. */
-				printk("[");
-				dump_packet(info, skb,
+				sb_add(m, "[");
+				dump_packet(m, info, skb,
 					    iphoff + ih->ihl*4+sizeof(_icmph));
-				printk("] ");
+				sb_add(m, "] ");
 			}
 
 			/* Max length: 10 "MTU=65535 " */
 			if (ich->type == ICMP_DEST_UNREACH &&
 			    ich->code == ICMP_FRAG_NEEDED)
-				printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+				sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
 		}
 		break;
 	}
@@ -291,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
 			break;
 
 		/* Max length: 9 "PROTO=AH " */
-		printk("PROTO=AH ");
+		sb_add(m, "PROTO=AH ");
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_ahdr), &_ahdr);
 		if (ah == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Length: 15 "SPI=0xF1234567 " */
-		printk("SPI=0x%x ", ntohl(ah->spi));
+		sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
 		break;
 	}
 	case IPPROTO_ESP: {
@@ -311,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct ip_esp_hdr *eh;
 
 		/* Max length: 10 "PROTO=ESP " */
-		printk("PROTO=ESP ");
+		sb_add(m, "PROTO=ESP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -320,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
 		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_esph), &_esph);
 		if (eh == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Length: 15 "SPI=0xF1234567 " */
-		printk("SPI=0x%x ", ntohl(eh->spi));
+		sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
 		break;
 	}
 	/* Max length: 10 "PROTO 255 " */
 	default:
-		printk("PROTO=%u ", ih->protocol);
+		sb_add(m, "PROTO=%u ", ih->protocol);
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
 	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			printk("UID=%u GID=%u ",
+			sb_add(m, "UID=%u GID=%u ",
 				skb->sk->sk_socket->file->f_cred->fsuid,
 				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -346,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
 	if (!iphoff && skb->mark)
-		printk("MARK=0x%x ", skb->mark);
+		sb_add(m, "MARK=0x%x ", skb->mark);
 
 	/* Proto    Max log string length */
 	/* IP:      40+46+6+11+127 = 230 */
@@ -363,6 +363,43 @@ static void dump_packet(const struct nf_loginfo *info,
 	/* maxlen = 230+   91  + 230 + 252 = 803 */
 }
 
+static void dump_mac_header(struct sbuff *m,
+			    const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & IPT_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	sb_add(m, "MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int i;
+
+		sb_add(m, "%02x", *p++);
+		for (i = 1; i < dev->hard_header_len; i++, p++)
+			sb_add(m, ":%02x", *p);
+	}
+	sb_add(m, " ");
+}
+
 static struct nf_loginfo default_loginfo = {
 	.type	= NF_LOG_TYPE_LOG,
 	.u = {
@@ -382,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
 	       const struct nf_loginfo *loginfo,
 	       const char *prefix)
 {
+	struct sbuff *m = sb_open();
+
 	if (!loginfo)
 		loginfo = &default_loginfo;
 
-	spin_lock_bh(&log_lock);
-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
 	       prefix,
 	       in ? in->name : "",
 	       out ? out->name : "");
@@ -397,31 +435,20 @@ ipt_log_packet(u_int8_t pf,
 
 		physindev = skb->nf_bridge->physindev;
 		if (physindev && in != physindev)
-			printk("PHYSIN=%s ", physindev->name);
+			sb_add(m, "PHYSIN=%s ", physindev->name);
 		physoutdev = skb->nf_bridge->physoutdev;
 		if (physoutdev && out != physoutdev)
-			printk("PHYSOUT=%s ", physoutdev->name);
+			sb_add(m, "PHYSOUT=%s ", physoutdev->name);
 	}
 #endif
 
-	if (in && !out) {
-		/* MAC logging for input chain only. */
-		printk("MAC=");
-		if (skb->dev && skb->dev->hard_header_len &&
-		    skb->mac_header != skb->network_header) {
-			int i;
-			const unsigned char *p = skb_mac_header(skb);
-			for (i = 0; i < skb->dev->hard_header_len; i++,p++)
-				printk("%02x%c", *p,
-				       i==skb->dev->hard_header_len - 1
-				       ? ' ':':');
-		} else
-			printk(" ");
-	}
+	/* MAC logging for input path only. */
+	if (in && !out)
+		dump_mac_header(m, loginfo, skb);
+
+	dump_packet(m, loginfo, skb, 0);
 
-	dump_packet(loginfo, skb, 0);
-	printk("\n");
-	spin_unlock_bh(&log_lock);
+	sb_close(m);
 }
 
 static unsigned int
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index f43867d1697..6cdb298f103 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
 		     par->hooknum == NF_INET_POST_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_OUT);
+		     par->hooknum == NF_INET_LOCAL_OUT ||
+		     par->hooknum == NF_INET_LOCAL_IN);
 	ct = nf_ct_get(skb, &ctinfo);
 
 	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
@@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = {
 	.table		= "nat",
 	.hooks		= (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING) |
-			  (1 << NF_INET_LOCAL_OUT),
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
 	.checkentry 	= netmap_tg_check,
 	.me 		= THIS_MODULE
 };
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index f5f4a888e4e..43eec80c0e7 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -95,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	}
 
 	tcph->rst	= 1;
-	tcph->check	= tcp_v4_check(sizeof(struct tcphdr),
-				       niph->saddr, niph->daddr,
-				       csum_partial(tcph,
-						    sizeof(struct tcphdr), 0));
+	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+				    niph->daddr, 0);
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+	nskb->csum_start = (unsigned char *)tcph - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
 
 	addr_type = RTN_UNSPEC;
 	if (hook != NF_INET_FORWARD
@@ -109,13 +110,13 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 		addr_type = RTN_LOCAL;
 
 	/* ip_route_me_harder expects skb->dst to be set */
-	skb_dst_set(nskb, dst_clone(skb_dst(oldskb)));
+	skb_dst_set_noref(nskb, skb_dst(oldskb));
 
+	nskb->protocol = htons(ETH_P_IP);
 	if (ip_route_me_harder(nskb, addr_type))
 		goto free_nskb;
 
 	niph->ttl	= dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
-	nskb->ip_summed = CHECKSUM_NONE;
 
 	/* "Never happens" */
 	if (nskb->len > dst_mtu(skb_dst(nskb)))
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d6..37f8adb68c7 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/percpu.h>
+#include <linux/security.h>
 #include <net/net_namespace.h>
 
 #include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	int ret;
+	u32 len;
+	char *secctx;
+
+	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+	if (ret)
+		return ret;
+
+	ret = seq_printf(s, "secctx=%s ", secctx);
+
+	security_release_secctx(secctx, len);
+	return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
 static int ct_seq_show(struct seq_file *s, void *v)
 {
 	struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		goto release;
 #endif
 
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-	if (seq_printf(s, "secmark=%u ", ct->secmark))
+	if (ct_show_secctx(s, ct))
 		goto release;
-#endif
 
 	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
 		goto release;
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index cb763ae9ed9..f3a9b42b16c 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,6 +66,13 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
 					  const struct net_device *out,
 					  int (*okfn)(struct sk_buff *))
 {
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(skb->sk);
+
+	if (sk && (sk->sk_family == PF_INET) &&
+	    inet->nodefrag)
+		return NF_ACCEPT;
+
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
 	/* Previously seen (loopback)?  Ignore.  Do this before
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b8766825..0f23b3f06df 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 4f8bddb760c..295c97431e4 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
 static struct nf_conntrack_l3proto *l3proto __read_mostly;
 
 #define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
 						__read_mostly;
 
 static inline const struct nf_nat_protocol *
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
 	return rcu_dereference(nf_nat_protos[protonum]);
 }
 
-const struct nf_nat_protocol *
+static const struct nf_nat_protocol *
 nf_nat_proto_find_get(u_int8_t protonum)
 {
 	const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
 
 	return p;
 }
-EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
 
-void
+static void
 nf_nat_proto_put(const struct nf_nat_protocol *p)
 {
 	module_put(p->me);
 }
-EXPORT_SYMBOL_GPL(nf_nat_proto_put);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
@@ -261,17 +259,18 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	rcu_read_lock();
 	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 
-	/* Change protocol info to have some randomization */
-	if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
-		proto->unique_tuple(tuple, range, maniptype, ct);
-		goto out;
-	}
-
 	/* Only bother mapping if it's not already in range and unique */
-	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
-	     proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
-	    !nf_nat_used_tuple(tuple, ct))
-		goto out;
+	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+		if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+			if (proto->in_range(tuple, maniptype, &range->min,
+					    &range->max) &&
+			    (range->min.all == range->max.all ||
+			     !nf_nat_used_tuple(tuple, ct)))
+				goto out;
+		} else if (!nf_nat_used_tuple(tuple, ct)) {
+			goto out;
+		}
+	}
 
 	/* Last change: get protocol to try to obtain unique tuple. */
 	proto->unique_tuple(tuple, range, maniptype, ct);
@@ -440,7 +439,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 	if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
 		return 0;
 
-	inside = (void *)skb->data + ip_hdrlen(skb);
+	inside = (void *)skb->data + hdrlen;
 
 	/* We're actually going to mangle it beyond trivial checksum
 	   adjustment, so make sure the current checksum is correct. */
@@ -463,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 			return 0;
 	}
 
+	if (manip == IP_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	if (!(ct->status & statusbit))
+		return 1;
+
 	pr_debug("icmp_reply_translation: translating error %p manip %u "
 		 "dir %s\n", skb, manip,
 		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -470,12 +481,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 	/* rcu_read_lock()ed by nf_hook_slow */
 	l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
 
-	if (!nf_ct_get_tuple(skb,
-			     ip_hdrlen(skb) + sizeof(struct icmphdr),
-			     (ip_hdrlen(skb) +
+	if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
+			     (hdrlen +
 			      sizeof(struct icmphdr) + inside->ip.ihl * 4),
-			     (u_int16_t)AF_INET,
-			     inside->ip.protocol,
+			     (u_int16_t)AF_INET, inside->ip.protocol,
 			     &inner, l3proto, l4proto))
 		return 0;
 
@@ -484,15 +493,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 	   pass all hooks (locally-generated ICMP).  Consider incoming
 	   packet: PREROUTING (DST manip), routing produces ICMP, goes
 	   through POSTROUTING (which must correct the DST manip). */
-	if (!manip_pkt(inside->ip.protocol, skb,
-		       ip_hdrlen(skb) + sizeof(inside->icmp),
-		       &ct->tuplehash[!dir].tuple,
-		       !manip))
+	if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
+		       &ct->tuplehash[!dir].tuple, !manip))
 		return 0;
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Reloading "inside" here since manip_pkt inner. */
-		inside = (void *)skb->data + ip_hdrlen(skb);
+		inside = (void *)skb->data + hdrlen;
 		inside->icmp.checksum = 0;
 		inside->icmp.checksum =
 			csum_fold(skb_checksum(skb, hdrlen,
@@ -501,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 
 	/* Change outer to look the reply to an incoming packet
 	 * (proto 0 means don't invert per-proto part). */
-	if (manip == IP_NAT_MANIP_SRC)
-		statusbit = IPS_SRC_NAT;
-	else
-		statusbit = IPS_DST_NAT;
-
-	/* Invert if this is reply dir. */
-	if (dir == IP_CT_DIR_REPLY)
-		statusbit ^= IPS_NAT_MASK;
-
-	if (ct->status & statusbit) {
-		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-		if (!manip_pkt(0, skb, 0, &target, manip))
-			return 0;
-	}
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+	if (!manip_pkt(0, skb, 0, &target, manip))
+		return 0;
 
 	return 1;
 }
@@ -742,7 +738,7 @@ static int __init nf_nat_init(void)
 	spin_unlock_bh(&nf_nat_lock);
 
 	/* Initialize fake conntrack so that NAT will skip it */
-	nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
 
 	l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
 
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a..dc73abb3fe2 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853..790f3160e01 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 	/* Try to get a pair of ports. */
 	for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
 	     nated_port != 0; nated_port += 2) {
+		int ret;
+
 		rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
-		if (nf_ct_expect_related(rtp_exp) == 0) {
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == 0) {
 			rtcp_exp->tuple.dst.u.udp.port =
 			    htons(nated_port + 1);
-			if (nf_ct_expect_related(rtcp_exp) == 0)
+			ret = nf_ct_expect_related(rtcp_exp);
+			if (ret == 0)
+				break;
+			else if (ret != -EBUSY) {
+				nf_ct_unexpect_related(rtp_exp);
+				nated_port = 0;
 				break;
-			nf_ct_unexpect_related(rtp_exp);
+			}
+		} else if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
 		}
 	}
 
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (; nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
 			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (; nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
 			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (; nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
 			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
 			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548ee..31427fb57aa 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 }
 EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
 
+static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
+			int datalen, __sum16 *check, int oldlen)
+{
+	struct rtable *rt = skb_rtable(skb);
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		if (!(rt->rt_flags & RTCF_LOCAL) &&
+		    skb->dev->features & NETIF_F_V4_CSUM) {
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_headroom(skb) +
+					  skb_network_offset(skb) +
+					  iph->ihl * 4;
+			skb->csum_offset = (void *)check - data;
+			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						    datalen, iph->protocol, 0);
+		} else {
+			*check = 0;
+			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+						   datalen, iph->protocol,
+						   csum_partial(data, datalen,
+								0));
+			if (iph->protocol == IPPROTO_UDP && !*check)
+				*check = CSUM_MANGLED_0;
+		}
+	} else
+		inet_proto_csum_replace2(check, skb,
+					 htons(oldlen), htons(datalen), 1);
+}
+
 /* Generic function for mangling variable-length address changes inside
  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
  * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			       const char *rep_buffer,
 			       unsigned int rep_len, bool adjust)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 	struct tcphdr *tcph;
 	int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			match_offset, match_len, rep_buffer, rep_len);
 
 	datalen = skb->len - iph->ihl*4;
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    skb->dev->features & NETIF_F_V4_CSUM) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  iph->ihl * 4;
-			skb->csum_offset = offsetof(struct tcphdr, check);
-			tcph->check = ~tcp_v4_check(datalen,
-						    iph->saddr, iph->daddr, 0);
-		} else {
-			tcph->check = 0;
-			tcph->check = tcp_v4_check(datalen,
-						   iph->saddr, iph->daddr,
-						   csum_partial(tcph,
-								datalen, 0));
-		}
-	} else
-		inet_proto_csum_replace2(&tcph->check, skb,
-					 htons(oldlen), htons(datalen), 1);
+	nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
 
 	if (adjust && rep_len != match_len)
 		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 			 const char *rep_buffer,
 			 unsigned int rep_len)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 	struct udphdr *udph;
 	int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 	if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
 		return 1;
 
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    skb->dev->features & NETIF_F_V4_CSUM) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  iph->ihl * 4;
-			skb->csum_offset = offsetof(struct udphdr, check);
-			udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-							 datalen, IPPROTO_UDP,
-							 0);
-		} else {
-			udph->check = 0;
-			udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-							datalen, IPPROTO_UDP,
-							csum_partial(udph,
-								     datalen, 0));
-			if (!udph->check)
-				udph->check = CSUM_MANGLED_0;
-		}
-	} else
-		inet_proto_csum_replace2(&udph->check, skb,
-					 htons(oldlen), htons(datalen), 1);
+	nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
 
 	return 1;
 }
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03..535e1a80235 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 6c4f11f5144..3e61faf23a9 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
 }
 EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
 
-bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 			       const struct nf_nat_range *range,
 			       enum nf_nat_manip_type maniptype,
 			       const struct nf_conn *ct,
@@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
 		/* If it's dst rewrite, can't change port */
 		if (maniptype == IP_NAT_MANIP_DST)
-			return false;
+			return;
 
 		if (ntohs(*portptr) < 1024) {
 			/* Loose convention: >> 512 is credential passing */
@@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 	else
 		off = *rover;
 
-	for (i = 0; i < range_size; i++, off++) {
+	for (i = 0; ; ++off) {
 		*portptr = htons(min + off % range_size);
-		if (nf_nat_used_tuple(tuple, ct))
+		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
 			continue;
 		if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
 			*rover = off;
-		return true;
+		return;
 	}
-	return false;
+	return;
 }
 EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
 
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 22485ce306d..570faf2667b 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -22,14 +22,14 @@
 
 static u_int16_t dccp_port_rover;
 
-static bool
+static void
 dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &dccp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &dccp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index d7e89201351..bc8d83a31c7 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
 MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 
 /* generate unique tuple ... */
-static bool
+static void
 gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
@@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 	/* If there is no master conntrack we are not PPTP,
 	   do not change tuples */
 	if (!ct->master)
-		return false;
+		return;
 
 	if (maniptype == IP_NAT_MANIP_SRC)
 		keyptr = &tuple->src.u.gre.key;
@@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
 
 	pr_debug("min = %u, range_size = %u\n", min, range_size);
 
-	for (i = 0; i < range_size; i++, key++) {
+	for (i = 0; ; ++key) {
 		*keyptr = htons(min + key % range_size);
-		if (!nf_nat_used_tuple(tuple, ct))
-			return true;
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+			return;
 	}
 
 	pr_debug("%p: no NAT mapping\n", ct);
-	return false;
+	return;
 }
 
 /* manipulate a GRE packet according to maniptype */
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 19a8b0b07d8..5744c3ec847 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
 	       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
 }
 
-static bool
+static void
 icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
@@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
 	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
 		range_size = 0xFFFF;
 
-	for (i = 0; i < range_size; i++, id++) {
+	for (i = 0; ; ++id) {
 		tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
 					     (id % range_size));
-		if (!nf_nat_used_tuple(tuple, ct))
-			return true;
+		if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+			return;
 	}
-	return false;
+	return;
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 3fc598eeeb1..756331d4266 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -16,14 +16,14 @@
 
 static u_int16_t nf_sctp_port_rover;
 
-static bool
+static void
 sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &nf_sctp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &nf_sctp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 399e2cfa263..aa460a595d5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -20,14 +20,13 @@
 
 static u_int16_t tcp_port_rover;
 
-static bool
+static void
 tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &tcp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 9e61c79492e..dfe65c7e292 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -19,14 +19,13 @@
 
 static u_int16_t udp_port_rover;
 
-static bool
+static void
 udp_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_nat_range *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &udp_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 440a229bbd8..3cc8c8af39e 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -18,14 +18,14 @@
 
 static u_int16_t udplite_port_rover;
 
-static bool
+static void
 udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
 		     const struct nf_nat_range *range,
 		     enum nf_nat_manip_type maniptype,
 		     const struct nf_conn *ct)
 {
-	return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
-					 &udplite_port_rover);
+	nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+				  &udplite_port_rover);
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index 14381c62ace..a50f2bc1c73 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
 	return true;
 }
 
-static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
+static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
 				 const struct nf_nat_range *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct)
 {
 	/* Sorry: we can't help you; if it's not unique, we can't frob
 	   anything. */
-	return false;
+	return;
 }
 
 static bool
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 98ed78281ae..21c30426480 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -28,7 +28,8 @@
 
 #define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
 			 (1 << NF_INET_POST_ROUTING) | \
-			 (1 << NF_INET_LOCAL_OUT))
+			 (1 << NF_INET_LOCAL_OUT) | \
+			 (1 << NF_INET_LOCAL_IN))
 
 static const struct xt_table nat_table = {
 	.name		= "nat",
@@ -45,7 +46,8 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
 	enum ip_conntrack_info ctinfo;
 	const struct nf_nat_multi_range_compat *mr = par->targinfo;
 
-	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
+	NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
+		     par->hooknum == NF_INET_LOCAL_IN);
 
 	ct = nf_ct_get(skb, &ctinfo);
 
@@ -99,21 +101,20 @@ static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
 	return 0;
 }
 
-unsigned int
+static unsigned int
 alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 {
 	/* Force range to this IP; let proto decide mapping for
 	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
-	   Use reply in case it's already been mangled (eg local packet).
 	*/
-	__be32 ip
-		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
-		   ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
-		   : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-	struct nf_nat_range range
-		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
-	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
 	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
 }
 
@@ -141,7 +142,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
 	.target		= ipt_snat_target,
 	.targetsize	= sizeof(struct nf_nat_multi_range_compat),
 	.table		= "nat",
-	.hooks		= 1 << NF_INET_POST_ROUTING,
+	.hooks		= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
 	.checkentry	= ipt_snat_checkentry,
 	.family		= AF_INET,
 };
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaae..e40cf7816fd 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
 	exp->expectfn = ip_nat_sip_expected;
 
 	for (; port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.udp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
 	/* Try to get same pair of ports: if not, try to change them. */
 	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
 	     port != 0; port += 2) {
+		int ret;
+
 		rtp_exp->tuple.dst.u.udp.port = htons(port);
-		if (nf_ct_expect_related(rtp_exp) != 0)
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == -EBUSY)
 			continue;
+		else if (ret < 0) {
+			port = 0;
+			break;
+		}
 		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
-		if (nf_ct_expect_related(rtcp_exp) == 0)
+		ret = nf_ct_expect_related(rtcp_exp);
+		if (ret == 0)
 			break;
-		nf_ct_unexpect_related(rtp_exp);
+		else if (ret != -EBUSY) {
+			nf_ct_unexpect_related(rtp_exp);
+			port = 0;
+			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 1679e2c0963..ee5f419d0a5 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -893,13 +893,15 @@ static void fast_csum(__sum16 *csum,
 	unsigned char s[4];
 
 	if (offset & 1) {
-		s[0] = s[2] = 0;
+		s[0] = ~0;
 		s[1] = ~*optr;
+		s[2] = 0;
 		s[3] = *nptr;
 	} else {
-		s[1] = s[3] = 0;
 		s[0] = ~*optr;
+		s[1] = ~0;
 		s[2] = *nptr;
+		s[3] = 0;
 	}
 
 	*csum = csum_fold(csum_partial(s, 4, ~csum_unfold(*csum)));
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index beb25819c9c..95481fee8bd 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
 		return NF_ACCEPT;
 
 	/* Don't try to NAT if this packet is not conntracked */
-	if (ct == &nf_conntrack_untracked)
+	if (nf_ct_is_untracked(ct))
 		return NF_ACCEPT;
 
 	nat = nfct_nat(ct);
@@ -131,13 +131,7 @@ nf_nat_fn(unsigned int hooknum,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			if (hooknum == NF_INET_LOCAL_IN)
-				/* LOCAL_IN hook doesn't have a chain!  */
-				ret = alloc_null_binding(ct, hooknum);
-			else
-				ret = nf_nat_rule_find(skb, hooknum, in, out,
-						       ct);
-
+			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
 			if (ret != NF_ACCEPT)
 				return ret;
 		} else
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3dc9914c1dc..4ae1f203f7c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -252,6 +252,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
 	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
 	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
 	SNMP_MIB_SENTINEL
 };
 
@@ -342,10 +343,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
 		   sysctl_ip_default_ttl);
 
+	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
-					   snmp4_ipstats_list[i].entry));
+		seq_printf(seq, " %llu",
+			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+					     snmp4_ipstats_list[i].entry,
+					     offsetof(struct ipstats_mib, syncp)));
 
 	icmp_put(seq);	/* RFC 2011 compatibility */
 	icmpmsg_put(seq);
@@ -431,9 +434,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 
 	seq_puts(seq, "\nIpExt:");
 	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
-					   snmp4_ipextstats_list[i].entry));
+		seq_printf(seq, " %llu",
+			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+					     snmp4_ipextstats_list[i].entry,
+					     offsetof(struct ipstats_mib, syncp)));
 
 	seq_putc(seq, '\n');
 	return 0;
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 542f22fc98b..9ae5c01cd0b 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 
-const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
-static DEFINE_SPINLOCK(inet_proto_lock);
+const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
 
 /*
  *	Add a protocol handler to the hash tables
@@ -37,21 +36,12 @@ static DEFINE_SPINLOCK(inet_proto_lock);
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash, ret;
+	int hash = protocol & (MAX_INET_PROTOS - 1);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-
-	spin_lock_bh(&inet_proto_lock);
-	if (inet_protos[hash]) {
-		ret = -1;
-	} else {
-		inet_protos[hash] = prot;
-		ret = 0;
-	}
-	spin_unlock_bh(&inet_proto_lock);
-
-	return ret;
+	return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
+			NULL, prot) ? 0 : -1;
 }
+EXPORT_SYMBOL(inet_add_protocol);
 
 /*
  *	Remove a protocol from the hash tables.
@@ -59,23 +49,13 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash, ret;
+	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-
-	spin_lock_bh(&inet_proto_lock);
-	if (inet_protos[hash] == prot) {
-		inet_protos[hash] = NULL;
-		ret = 0;
-	} else {
-		ret = -1;
-	}
-	spin_unlock_bh(&inet_proto_lock);
+	ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
+		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
 
 	return ret;
 }
-
-EXPORT_SYMBOL(inet_add_protocol);
 EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c7a1639388..1f85ef28989 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -314,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
 }
 
 static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
-			struct rtable *rt,
+			struct rtable **rtp,
 			unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -323,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 	struct sk_buff *skb;
 	unsigned int iphlen;
 	int err;
+	struct rtable *rt = *rtp;
 
-	if (length > rt->u.dst.dev->mtu) {
+	if (length > rt->dst.dev->mtu) {
 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
-			       rt->u.dst.dev->mtu);
+			       rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
 		goto out;
 
 	skb = sock_alloc_send_skb(sk,
-				  length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15,
+				  length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
 				  flags & MSG_DONTWAIT, &err);
 	if (skb == NULL)
 		goto error;
-	skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev));
+	skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
 
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	skb_dst_set(skb, dst_clone(&rt->u.dst));
+	skb_dst_set(skb, &rt->dst);
+	*rtp = NULL;
 
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
@@ -373,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 		iph->check   = 0;
 		iph->tot_len = htons(length);
 		if (!iph->id)
-			ip_select_ident(iph, &rt->u.dst, NULL);
+			ip_select_ident(iph, &rt->dst, NULL);
 
 		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 	}
@@ -382,7 +384,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 			skb_transport_header(skb))->type);
 
 	err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
-		      rt->u.dst.dev, dst_output);
+		      rt->dst.dev, dst_output);
 	if (err > 0)
 		err = net_xmit_errno(err);
 	if (err)
@@ -503,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 	ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
@@ -576,7 +578,7 @@ back_from_confirm:
 
 	if (inet->hdrincl)
 		err = raw_send_hdrinc(sk, msg->msg_iov, len,
-					rt, msg->msg_flags);
+					&rt, msg->msg_flags);
 
 	 else {
 		if (!ipc.addr)
@@ -604,7 +606,7 @@ out:
 	return len;
 
 do_confirm:
-	dst_confirm(&rt->u.dst);
+	dst_confirm(&rt->dst);
 	if (!(msg->msg_flags & MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 560acc677ce..987bf9adb31 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
 	.local_out =		__ip_local_out,
-	.entries =		ATOMIC_INIT(0),
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
@@ -199,7 +198,7 @@ const __u8 ip_tos2prio[16] = {
  */
 
 struct rt_hash_bucket {
-	struct rtable	*chain;
+	struct rtable __rcu	*chain;
 };
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -253,8 +252,7 @@ static unsigned			rt_hash_mask __read_mostly;
 static unsigned int		rt_hash_log  __read_mostly;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) \
-	(__raw_get_cpu_var(rt_cache_stat).field++)
+#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 				   int genid)
@@ -282,15 +280,15 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
 	struct rtable *r = NULL;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rt_hash_table[st->bucket].chain)
+		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 			continue;
 		rcu_read_lock_bh();
 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 		while (r) {
-			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
+			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 			    r->rt_genid == st->genid)
 				return r;
-			r = rcu_dereference_bh(r->u.dst.rt_next);
+			r = rcu_dereference_bh(r->dst.rt_next);
 		}
 		rcu_read_unlock_bh();
 	}
@@ -302,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 {
 	struct rt_cache_iter_state *st = seq->private;
 
-	r = r->u.dst.rt_next;
+	r = rcu_dereference_bh(r->dst.rt_next);
 	while (!r) {
 		rcu_read_unlock_bh();
 		do {
 			if (--st->bucket < 0)
 				return NULL;
-		} while (!rt_hash_table[st->bucket].chain);
+		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
+		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 	}
-	return rcu_dereference_bh(r);
+	return r;
 }
 
 static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -320,7 +318,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
 {
 	struct rt_cache_iter_state *st = seq->private;
 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
+		if (dev_net(r->dst.dev) != seq_file_net(seq))
 			continue;
 		if (r->rt_genid == st->genid)
 			break;
@@ -378,19 +376,19 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 
 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			r->u.dst.dev ? r->u.dst.dev->name : "*",
+			r->dst.dev ? r->dst.dev->name : "*",
 			(__force u32)r->rt_dst,
 			(__force u32)r->rt_gateway,
-			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
-			r->u.dst.__use, 0, (__force u32)r->rt_src,
-			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
-			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
-			dst_metric(&r->u.dst, RTAX_WINDOW),
-			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
-			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
+			r->rt_flags, atomic_read(&r->dst.__refcnt),
+			r->dst.__use, 0, (__force u32)r->rt_src,
+			(dst_metric(&r->dst, RTAX_ADVMSS) ?
+			     (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+			dst_metric(&r->dst, RTAX_WINDOW),
+			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
+			      dst_metric(&r->dst, RTAX_RTTVAR)),
 			r->fl.fl4_tos,
-			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
-			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
+			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
+			r->dst.hh ? (r->dst.hh->hh_output ==
 				       dev_queue_xmit) : 0,
 			r->rt_spec_dst, &len);
 
@@ -467,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
-		   atomic_read(&ipv4_dst_ops.entries),
+		   dst_entries_get_slow(&ipv4_dst_ops),
 		   st->in_hit,
 		   st->in_slow_tot,
 		   st->in_slow_mc,
@@ -609,13 +607,13 @@ static inline int ip_rt_proc_init(void)
 
 static inline void rt_free(struct rtable *rt)
 {
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static inline void rt_drop(struct rtable *rt)
 {
 	ip_rt_put(rt);
-	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 }
 
 static inline int rt_fast_clean(struct rtable *rth)
@@ -623,13 +621,13 @@ static inline int rt_fast_clean(struct rtable *rth)
 	/* Kill broadcast/multicast entries very aggresively, if they
 	   collide in hash table with more useful entries */
 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rth->fl.iif && rth->u.dst.rt_next;
+		rth->fl.iif && rth->dst.rt_next;
 }
 
 static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->u.dst.expires;
+		rth->dst.expires;
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -637,15 +635,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
 	unsigned long age;
 	int ret = 0;
 
-	if (atomic_read(&rth->u.dst.__refcnt))
+	if (atomic_read(&rth->dst.__refcnt))
 		goto out;
 
 	ret = 1;
-	if (rth->u.dst.expires &&
-	    time_after_eq(jiffies, rth->u.dst.expires))
+	if (rth->dst.expires &&
+	    time_after_eq(jiffies, rth->dst.expires))
 		goto out;
 
-	age = jiffies - rth->u.dst.lastuse;
+	age = jiffies - rth->dst.lastuse;
 	ret = 0;
 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 	    (age <= tmo2 && rt_valuable(rth)))
@@ -661,7 +659,7 @@ out:	return ret;
  */
 static inline u32 rt_score(struct rtable *rt)
 {
-	u32 score = jiffies - rt->u.dst.lastuse;
+	u32 score = jiffies - rt->dst.lastuse;
 
 	score = ~score & ~(3<<30);
 
@@ -701,12 +699,12 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 
 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 {
-	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
+	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 }
 
 static inline int rt_is_expired(struct rtable *rth)
 {
-	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
+	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
@@ -723,19 +721,23 @@ static void rt_do_flush(int process_context)
 	for (i = 0; i <= rt_hash_mask; i++) {
 		if (process_context && need_resched())
 			cond_resched();
-		rth = rt_hash_table[i].chain;
+		rth = rcu_dereference_raw(rt_hash_table[i].chain);
 		if (!rth)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
 #ifdef CONFIG_NET_NS
 		{
-		struct rtable ** prev, * p;
+		struct rtable __rcu **prev;
+		struct rtable *p;
 
-		rth = rt_hash_table[i].chain;
+		rth = rcu_dereference_protected(rt_hash_table[i].chain,
+			lockdep_is_held(rt_hash_lock_addr(i)));
 
 		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail; tail = tail->u.dst.rt_next)
+		for (tail = rth; tail;
+		     tail = rcu_dereference_protected(tail->dst.rt_next,
+				lockdep_is_held(rt_hash_lock_addr(i))))
 			if (!rt_is_expired(tail))
 				break;
 		if (rth != tail)
@@ -743,10 +745,14 @@ static void rt_do_flush(int process_context)
 
 		/* call rt_free on entries after the tail requiring flush */
 		prev = &rt_hash_table[i].chain;
-		for (p = *prev; p; p = next) {
-			next = p->u.dst.rt_next;
+		for (p = rcu_dereference_protected(*prev,
+				lockdep_is_held(rt_hash_lock_addr(i)));
+		     p != NULL;
+		     p = next) {
+			next = rcu_dereference_protected(p->dst.rt_next,
+				lockdep_is_held(rt_hash_lock_addr(i)));
 			if (!rt_is_expired(p)) {
-				prev = &p->u.dst.rt_next;
+				prev = &p->dst.rt_next;
 			} else {
 				*prev = next;
 				rt_free(p);
@@ -754,14 +760,15 @@ static void rt_do_flush(int process_context)
 		}
 		}
 #else
-		rth = rt_hash_table[i].chain;
-		rt_hash_table[i].chain = NULL;
+		rth = rcu_dereference_protected(rt_hash_table[i].chain,
+			lockdep_is_held(rt_hash_lock_addr(i)));
+		rcu_assign_pointer(rt_hash_table[i].chain, NULL);
 		tail = NULL;
 #endif
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth != tail; rth = next) {
-			next = rth->u.dst.rt_next;
+			next = rcu_dereference_protected(rth->dst.rt_next, 1);
 			rt_free(rth);
 		}
 	}
@@ -792,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	while (aux != rth) {
 		if (compare_hash_inputs(&aux->fl, &rth->fl))
 			return 0;
-		aux = aux->u.dst.rt_next;
+		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 	}
 	return ONE;
 }
@@ -801,7 +808,8 @@ static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	unsigned long delta;
@@ -827,23 +835,24 @@ static void rt_check_expire(void)
 
 		samples++;
 
-		if (*rthp == NULL)
+		if (rcu_dereference_raw(*rthp) == NULL)
 			continue;
 		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = *rthp) != NULL) {
-			prefetch(rth->u.dst.rt_next);
+		while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+			prefetch(rth->dst.rt_next);
 			if (rt_is_expired(rth)) {
-				*rthp = rth->u.dst.rt_next;
+				*rthp = rth->dst.rt_next;
 				rt_free(rth);
 				continue;
 			}
-			if (rth->u.dst.expires) {
+			if (rth->dst.expires) {
 				/* Entry is expired even if it is in use */
-				if (time_before_eq(jiffies, rth->u.dst.expires)) {
+				if (time_before_eq(jiffies, rth->dst.expires)) {
 nofree:
 					tmo >>= 1;
-					rthp = &rth->u.dst.rt_next;
+					rthp = &rth->dst.rt_next;
 					/*
 					 * We only count entries on
 					 * a chain with equal hash inputs once
@@ -859,7 +868,7 @@ nofree:
 				goto nofree;
 
 			/* Cleanup aged off entries. */
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			rt_free(rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
@@ -943,9 +952,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	static unsigned long last_gc;
 	static int rover;
 	static int equilibrium;
-	struct rtable *rth, **rthp;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	unsigned long now = jiffies;
 	int goal;
+	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 
 	/*
 	 * Garbage collection is pretty expensive,
@@ -955,28 +966,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	RT_CACHE_STAT_INC(gc_total);
 
 	if (now - last_gc < ip_rt_gc_min_interval &&
-	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+	    entries < ip_rt_max_size) {
 		RT_CACHE_STAT_INC(gc_ignored);
 		goto out;
 	}
 
+	entries = dst_entries_get_slow(&ipv4_dst_ops);
 	/* Calculate number of entries, which we want to expire now. */
-	goal = atomic_read(&ipv4_dst_ops.entries) -
-		(ip_rt_gc_elasticity << rt_hash_log);
+	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 	if (goal <= 0) {
 		if (equilibrium < ipv4_dst_ops.gc_thresh)
 			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+		goal = entries - equilibrium;
 		if (goal > 0) {
 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+			goal = entries - equilibrium;
 		}
 	} else {
 		/* We are in dangerous area. Try to reduce cache really
 		 * aggressively.
 		 */
 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+		equilibrium = entries - goal;
 	}
 
 	if (now - last_gc >= ip_rt_gc_min_interval)
@@ -996,14 +1007,15 @@ static int rt_garbage_collect(struct dst_ops *ops)
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
 			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = *rthp) != NULL) {
+			while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 				if (!rt_is_expired(rth) &&
 					!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
-					rthp = &rth->u.dst.rt_next;
+					rthp = &rth->dst.rt_next;
 					continue;
 				}
-				*rthp = rth->u.dst.rt_next;
+				*rthp = rth->dst.rt_next;
 				rt_free(rth);
 				goal--;
 			}
@@ -1033,14 +1045,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
 		expire >>= 1;
 #if RT_CACHE_DEBUG >= 2
 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
-				atomic_read(&ipv4_dst_ops.entries), goal, i);
+				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
 #endif
 
-		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 			goto out;
 	} while (!in_softirq() && time_before_eq(jiffies, now));
 
-	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+		goto out;
+	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 		goto out;
 	if (net_ratelimit())
 		printk(KERN_WARNING "dst cache overflow\n");
@@ -1050,11 +1064,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
 work_done:
 	expire += ip_rt_gc_min_interval;
 	if (expire > ip_rt_gc_timeout ||
-	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 		expire = ip_rt_gc_timeout;
 #if RT_CACHE_DEBUG >= 2
 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
-			atomic_read(&ipv4_dst_ops.entries), goal, rover);
+			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
 #endif
 out:	return 0;
 }
@@ -1069,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head)
 
 	while (rth) {
 		length += has_noalias(head, rth);
-		rth = rth->u.dst.rt_next;
+		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
 	}
 	return length >> FRACT_BITS;
 }
@@ -1077,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head)
 static int rt_intern_hash(unsigned hash, struct rtable *rt,
 			  struct rtable **rp, struct sk_buff *skb, int ifindex)
 {
-	struct rtable	*rth, **rthp;
+	struct rtable	*rth, *cand;
+	struct rtable __rcu **rthp, **candp;
 	unsigned long	now;
-	struct rtable *cand, **candp;
 	u32 		min_score;
 	int		chain_length;
 	int attempts = !in_softirq();
@@ -1091,7 +1106,7 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
-	if (!rt_caching(dev_net(rt->u.dst.dev))) {
+	if (!rt_caching(dev_net(rt->dst.dev))) {
 		/*
 		 * If we're not caching, just tell the caller we
 		 * were successful and don't touch the route.  The
@@ -1103,44 +1118,45 @@ restart:
 		 * Note that we do rt_free on this new route entry, so that
 		 * once its refcount hits zero, we are still able to reap it
 		 * (Thanks Alexey)
-		 * Note also the rt_free uses call_rcu.  We don't actually
-		 * need rcu protection here, this is just our path to get
-		 * on the route gc list.
+		 * Note: To avoid expensive rcu stuff for this uncached dst,
+		 * we set DST_NOCACHE so that dst_release() can free dst without
+		 * waiting a grace period.
 		 */
 
+		rt->dst.flags |= DST_NOCACHE;
 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
-			int err = arp_bind_neighbour(&rt->u.dst);
+			int err = arp_bind_neighbour(&rt->dst);
 			if (err) {
 				if (net_ratelimit())
 					printk(KERN_WARNING
 					    "Neighbour table failure & not caching routes.\n");
-				rt_drop(rt);
+				ip_rt_put(rt);
 				return err;
 			}
 		}
 
-		rt_free(rt);
 		goto skip_hashing;
 	}
 
 	rthp = &rt_hash_table[hash].chain;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = *rthp) != NULL) {
+	while ((rth = rcu_dereference_protected(*rthp,
+			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
 		if (rt_is_expired(rth)) {
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			rt_free(rth);
 			continue;
 		}
 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 			/* Put it first */
-			*rthp = rth->u.dst.rt_next;
+			*rthp = rth->dst.rt_next;
 			/*
 			 * Since lookup is lockfree, the deletion
 			 * must be visible to another weakly ordered CPU before
 			 * the insertion at the start of the hash chain.
 			 */
-			rcu_assign_pointer(rth->u.dst.rt_next,
+			rcu_assign_pointer(rth->dst.rt_next,
 					   rt_hash_table[hash].chain);
 			/*
 			 * Since lookup is lockfree, the update writes
@@ -1148,18 +1164,18 @@ restart:
 			 */
 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
-			dst_use(&rth->u.dst, now);
+			dst_use(&rth->dst, now);
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
 			if (rp)
 				*rp = rth;
 			else
-				skb_dst_set(skb, &rth->u.dst);
+				skb_dst_set(skb, &rth->dst);
 			return 0;
 		}
 
-		if (!atomic_read(&rth->u.dst.__refcnt)) {
+		if (!atomic_read(&rth->dst.__refcnt)) {
 			u32 score = rt_score(rth);
 
 			if (score <= min_score) {
@@ -1171,7 +1187,7 @@ restart:
 
 		chain_length++;
 
-		rthp = &rth->u.dst.rt_next;
+		rthp = &rth->dst.rt_next;
 	}
 
 	if (cand) {
@@ -1182,17 +1198,17 @@ restart:
 		 * only 2 entries per bucket. We will see.
 		 */
 		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->u.dst.rt_next;
+			*candp = cand->dst.rt_next;
 			rt_free(cand);
 		}
 	} else {
 		if (chain_length > rt_chain_length_max &&
 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-			struct net *net = dev_net(rt->u.dst.dev);
+			struct net *net = dev_net(rt->dst.dev);
 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
 			if (!rt_caching(net)) {
 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->u.dst.dev->name, num);
+					rt->dst.dev->name, num);
 			}
 			rt_emergency_hash_rebuild(net);
 			spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1207,7 +1223,7 @@ restart:
 	   route or unicast forwarding path.
 	 */
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
-		int err = arp_bind_neighbour(&rt->u.dst);
+		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
@@ -1232,20 +1248,20 @@ restart:
 			}
 
 			if (net_ratelimit())
-				printk(KERN_WARNING "Neighbour table overflow.\n");
+				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
 			rt_drop(rt);
 			return -ENOBUFS;
 		}
 	}
 
-	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	rt->dst.rt_next = rt_hash_table[hash].chain;
 
 #if RT_CACHE_DEBUG >= 2
-	if (rt->u.dst.rt_next) {
+	if (rt->dst.rt_next) {
 		struct rtable *trt;
 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
 		       hash, &rt->rt_dst);
-		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
+		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
 			printk(" . %pI4", &trt->rt_dst);
 		printk("\n");
 	}
@@ -1263,24 +1279,17 @@ skip_hashing:
 	if (rp)
 		*rp = rt;
 	else
-		skb_dst_set(skb, &rt->u.dst);
+		skb_dst_set(skb, &rt->dst);
 	return 0;
 }
 
 void rt_bind_peer(struct rtable *rt, int create)
 {
-	static DEFINE_SPINLOCK(rt_peer_lock);
 	struct inet_peer *peer;
 
 	peer = inet_getpeer(rt->rt_dst, create);
 
-	spin_lock_bh(&rt_peer_lock);
-	if (rt->peer == NULL) {
-		rt->peer = peer;
-		peer = NULL;
-	}
-	spin_unlock_bh(&rt_peer_lock);
-	if (peer)
+	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
 }
 
@@ -1325,31 +1334,36 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
 	ip_select_fb_ident(iph);
 }
+EXPORT_SYMBOL(__ip_select_ident);
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
-	struct rtable **rthp, *aux;
+	struct rtable __rcu **rthp;
+	struct rtable *aux;
 
 	rthp = &rt_hash_table[hash].chain;
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
-	while ((aux = *rthp) != NULL) {
+	while ((aux = rcu_dereference_protected(*rthp,
+			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
 		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->u.dst.rt_next;
+			*rthp = aux->dst.rt_next;
 			rt_free(aux);
 			continue;
 		}
-		rthp = &aux->u.dst.rt_next;
+		rthp = &aux->dst.rt_next;
 	}
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
+/* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
 {
 	int i, k;
-	struct in_device *in_dev = in_dev_get(dev);
-	struct rtable *rth, **rthp;
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	__be32  skeys[2] = { saddr, 0 };
 	int  ikeys[2] = { dev->ifindex, 0 };
 	struct netevent_redirect netevent;
@@ -1382,9 +1396,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
 						rt_genid(net));
 
-			rthp=&rt_hash_table[hash].chain;
+			rthp = &rt_hash_table[hash].chain;
 
-			rcu_read_lock();
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
 
@@ -1393,44 +1406,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				    rth->fl.oif != ikeys[k] ||
 				    rth->fl.iif != 0 ||
 				    rt_is_expired(rth) ||
-				    !net_eq(dev_net(rth->u.dst.dev), net)) {
-					rthp = &rth->u.dst.rt_next;
+				    !net_eq(dev_net(rth->dst.dev), net)) {
+					rthp = &rth->dst.rt_next;
 					continue;
 				}
 
 				if (rth->rt_dst != daddr ||
 				    rth->rt_src != saddr ||
-				    rth->u.dst.error ||
+				    rth->dst.error ||
 				    rth->rt_gateway != old_gw ||
-				    rth->u.dst.dev != dev)
+				    rth->dst.dev != dev)
 					break;
 
-				dst_hold(&rth->u.dst);
-				rcu_read_unlock();
+				dst_hold(&rth->dst);
 
 				rt = dst_alloc(&ipv4_dst_ops);
 				if (rt == NULL) {
 					ip_rt_put(rth);
-					in_dev_put(in_dev);
 					return;
 				}
 
 				/* Copy all the information. */
 				*rt = *rth;
-				rt->u.dst.__use		= 1;
-				atomic_set(&rt->u.dst.__refcnt, 1);
-				rt->u.dst.child		= NULL;
-				if (rt->u.dst.dev)
-					dev_hold(rt->u.dst.dev);
+				rt->dst.__use		= 1;
+				atomic_set(&rt->dst.__refcnt, 1);
+				rt->dst.child		= NULL;
+				if (rt->dst.dev)
+					dev_hold(rt->dst.dev);
 				if (rt->idev)
 					in_dev_hold(rt->idev);
-				rt->u.dst.obsolete	= -1;
-				rt->u.dst.lastuse	= jiffies;
-				rt->u.dst.path		= &rt->u.dst;
-				rt->u.dst.neighbour	= NULL;
-				rt->u.dst.hh		= NULL;
+				rt->dst.obsolete	= -1;
+				rt->dst.lastuse	= jiffies;
+				rt->dst.path		= &rt->dst;
+				rt->dst.neighbour	= NULL;
+				rt->dst.hh		= NULL;
 #ifdef CONFIG_XFRM
-				rt->u.dst.xfrm		= NULL;
+				rt->dst.xfrm		= NULL;
 #endif
 				rt->rt_genid		= rt_genid(net);
 				rt->rt_flags		|= RTCF_REDIRECTED;
@@ -1439,23 +1450,23 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				rt->rt_gateway		= new_gw;
 
 				/* Redirect received -> path was valid */
-				dst_confirm(&rth->u.dst);
+				dst_confirm(&rth->dst);
 
 				if (rt->peer)
 					atomic_inc(&rt->peer->refcnt);
 
-				if (arp_bind_neighbour(&rt->u.dst) ||
-				    !(rt->u.dst.neighbour->nud_state &
+				if (arp_bind_neighbour(&rt->dst) ||
+				    !(rt->dst.neighbour->nud_state &
 					    NUD_VALID)) {
-					if (rt->u.dst.neighbour)
-						neigh_event_send(rt->u.dst.neighbour, NULL);
+					if (rt->dst.neighbour)
+						neigh_event_send(rt->dst.neighbour, NULL);
 					ip_rt_put(rth);
 					rt_drop(rt);
 					goto do_next;
 				}
 
-				netevent.old = &rth->u.dst;
-				netevent.new = &rt->u.dst;
+				netevent.old = &rth->dst;
+				netevent.new = &rt->dst;
 				call_netevent_notifiers(NETEVENT_REDIRECT,
 							&netevent);
 
@@ -1464,12 +1475,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 					ip_rt_put(rt);
 				goto do_next;
 			}
-			rcu_read_unlock();
 		do_next:
 			;
 		}
 	}
-	in_dev_put(in_dev);
 	return;
 
 reject_redirect:
@@ -1480,7 +1489,7 @@ reject_redirect:
 		       &old_gw, dev->name, &new_gw,
 		       &saddr, &daddr);
 #endif
-	in_dev_put(in_dev);
+	;
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1493,8 +1502,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ip_rt_put(rt);
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   (rt->u.dst.expires &&
-			    time_after_eq(jiffies, rt->u.dst.expires))) {
+			   (rt->dst.expires &&
+			    time_after_eq(jiffies, rt->dst.expires))) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
@@ -1532,7 +1541,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	int log_martians;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 		rcu_read_unlock();
 		return;
@@ -1543,30 +1552,30 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	/* No redirected packets during ip_rt_redirect_silence;
 	 * reset the algorithm.
 	 */
-	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
-		rt->u.dst.rate_tokens = 0;
+	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
+		rt->dst.rate_tokens = 0;
 
 	/* Too many ignored redirects; do not send anything
-	 * set u.dst.rate_last to the last seen redirected packet.
+	 * set dst.rate_last to the last seen redirected packet.
 	 */
-	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
-		rt->u.dst.rate_last = jiffies;
+	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
+		rt->dst.rate_last = jiffies;
 		return;
 	}
 
 	/* Check for load limit; set rate_last to the latest sent
 	 * redirect.
 	 */
-	if (rt->u.dst.rate_tokens == 0 ||
+	if (rt->dst.rate_tokens == 0 ||
 	    time_after(jiffies,
-		       (rt->u.dst.rate_last +
-			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+		       (rt->dst.rate_last +
+			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
-		rt->u.dst.rate_last = jiffies;
-		++rt->u.dst.rate_tokens;
+		rt->dst.rate_last = jiffies;
+		++rt->dst.rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
 		if (log_martians &&
-		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+		    rt->dst.rate_tokens == ip_rt_redirect_number &&
 		    net_ratelimit())
 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
 				&rt->rt_src, rt->rt_iif,
@@ -1581,7 +1590,7 @@ static int ip_error(struct sk_buff *skb)
 	unsigned long now;
 	int code;
 
-	switch (rt->u.dst.error) {
+	switch (rt->dst.error) {
 		case EINVAL:
 		default:
 			goto out;
@@ -1590,7 +1599,7 @@ static int ip_error(struct sk_buff *skb)
 			break;
 		case ENETUNREACH:
 			code = ICMP_NET_UNREACH;
-			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+			IP_INC_STATS_BH(dev_net(rt->dst.dev),
 					IPSTATS_MIB_INNOROUTES);
 			break;
 		case EACCES:
@@ -1599,12 +1608,12 @@ static int ip_error(struct sk_buff *skb)
 	}
 
 	now = jiffies;
-	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
-	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
-		rt->u.dst.rate_tokens = ip_rt_error_burst;
-	rt->u.dst.rate_last = now;
-	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
-		rt->u.dst.rate_tokens -= ip_rt_error_cost;
+	rt->dst.rate_tokens += now - rt->dst.rate_last;
+	if (rt->dst.rate_tokens > ip_rt_error_burst)
+		rt->dst.rate_tokens = ip_rt_error_burst;
+	rt->dst.rate_last = now;
+	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
+		rt->dst.rate_tokens -= ip_rt_error_cost;
 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 	}
 
@@ -1649,7 +1658,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 			rcu_read_lock();
 			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-			     rth = rcu_dereference(rth->u.dst.rt_next)) {
+			     rth = rcu_dereference(rth->dst.rt_next)) {
 				unsigned short mtu = new_mtu;
 
 				if (rth->fl.fl4_dst != daddr ||
@@ -1658,8 +1667,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 				    rth->rt_src != iph->saddr ||
 				    rth->fl.oif != ikeys[k] ||
 				    rth->fl.iif != 0 ||
-				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
-				    !net_eq(dev_net(rth->u.dst.dev), net) ||
+				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
+				    !net_eq(dev_net(rth->dst.dev), net) ||
 				    rt_is_expired(rth))
 					continue;
 
@@ -1667,22 +1676,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 					/* BSD 4.2 compatibility hack :-( */
 					if (mtu == 0 &&
-					    old_mtu >= dst_mtu(&rth->u.dst) &&
+					    old_mtu >= dst_mtu(&rth->dst) &&
 					    old_mtu >= 68 + (iph->ihl << 2))
 						old_mtu -= iph->ihl << 2;
 
 					mtu = guess_mtu(old_mtu);
 				}
-				if (mtu <= dst_mtu(&rth->u.dst)) {
-					if (mtu < dst_mtu(&rth->u.dst)) {
-						dst_confirm(&rth->u.dst);
+				if (mtu <= dst_mtu(&rth->dst)) {
+					if (mtu < dst_mtu(&rth->dst)) {
+						dst_confirm(&rth->dst);
 						if (mtu < ip_rt_min_pmtu) {
 							mtu = ip_rt_min_pmtu;
-							rth->u.dst.metrics[RTAX_LOCK-1] |=
+							rth->dst.metrics[RTAX_LOCK-1] |=
 								(1 << RTAX_MTU);
 						}
-						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
-						dst_set_expires(&rth->u.dst,
+						rth->dst.metrics[RTAX_MTU-1] = mtu;
+						dst_set_expires(&rth->dst,
 							ip_rt_mtu_expires);
 					}
 					est_mtu = mtu;
@@ -1755,7 +1764,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 	if (rt)
-		dst_set_expires(&rt->u.dst, 0);
+		dst_set_expires(&rt->dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
@@ -1783,22 +1792,25 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
 	if (rt->fl.iif == 0)
 		src = rt->rt_src;
-	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
-		src = FIB_RES_PREFSRC(res);
-		fib_res_put(&res);
-	} else
-		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+	else {
+		rcu_read_lock();
+		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+			src = FIB_RES_PREFSRC(res);
+		else
+			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
 					RT_SCOPE_UNIVERSE);
+		rcu_read_unlock();
+	}
 	memcpy(addr, &src, 4);
 }
 
 #ifdef CONFIG_NET_CLS_ROUTE
 static void set_class_tag(struct rtable *rt, u32 tag)
 {
-	if (!(rt->u.dst.tclassid & 0xFFFF))
-		rt->u.dst.tclassid |= tag & 0xFFFF;
-	if (!(rt->u.dst.tclassid & 0xFFFF0000))
-		rt->u.dst.tclassid |= tag & 0xFFFF0000;
+	if (!(rt->dst.tclassid & 0xFFFF))
+		rt->dst.tclassid |= tag & 0xFFFF;
+	if (!(rt->dst.tclassid & 0xFFFF0000))
+		rt->dst.tclassid |= tag & 0xFFFF0000;
 }
 #endif
 
@@ -1810,30 +1822,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 		if (FIB_RES_GW(*res) &&
 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 			rt->rt_gateway = FIB_RES_GW(*res);
-		memcpy(rt->u.dst.metrics, fi->fib_metrics,
-		       sizeof(rt->u.dst.metrics));
+		memcpy(rt->dst.metrics, fi->fib_metrics,
+		       sizeof(rt->dst.metrics));
 		if (fi->fib_mtu == 0) {
-			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
-			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
+			rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
+			if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
 			    rt->rt_gateway != rt->rt_dst &&
-			    rt->u.dst.dev->mtu > 576)
-				rt->u.dst.metrics[RTAX_MTU-1] = 576;
+			    rt->dst.dev->mtu > 576)
+				rt->dst.metrics[RTAX_MTU-1] = 576;
 		}
 #ifdef CONFIG_NET_CLS_ROUTE
-		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
 #endif
 	} else
-		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
-
-	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
-		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
-		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+		rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
+
+	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
+		rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+	if (dst_mtu(&rt->dst) > IP_MAX_MTU)
+		rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
+		rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
 				       ip_rt_min_advmss);
-	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+	if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
+		rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
 
 #ifdef CONFIG_NET_CLS_ROUTE
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1844,14 +1856,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 	rt->rt_type = res->type;
 }
 
+/* called in rcu_read_lock() section */
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned hash;
+	unsigned int hash;
 	struct rtable *rth;
 	__be32 spec_dst;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	u32 itag = 0;
+	int err;
 
 	/* Primary sanity checks. */
 
@@ -1866,21 +1880,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
-	} else if (fib_validate_source(saddr, 0, tos, 0,
-					dev, &spec_dst, &itag, 0) < 0)
-		goto e_inval;
-
+	} else {
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag, 0);
+		if (err < 0)
+			goto e_err;
+	}
 	rth = dst_alloc(&ipv4_dst_ops);
 	if (!rth)
 		goto e_nobufs;
 
-	rth->u.dst.output = ip_rt_bug;
-	rth->u.dst.obsolete = -1;
+	rth->dst.output = ip_rt_bug;
+	rth->dst.obsolete = -1;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -1888,13 +1904,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
-	rth->u.dst.tclassid = itag;
+	rth->dst.tclassid = itag;
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= init_net.loopback_dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= init_net.loopback_dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->fl.oif	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
@@ -1902,27 +1918,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_flags	= RTCF_MULTICAST;
 	rth->rt_type	= RTN_MULTICAST;
 	if (our) {
-		rth->u.dst.input= ip_local_deliver;
+		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
 	}
 
 #ifdef CONFIG_IP_MROUTE
 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
-		rth->u.dst.input = ip_mr_input;
+		rth->dst.input = ip_mr_input;
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	in_dev_put(in_dev);
 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
 	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
 
 e_nobufs:
-	in_dev_put(in_dev);
 	return -ENOBUFS;
-
 e_inval:
-	in_dev_put(in_dev);
 	return -EINVAL;
+e_err:
+	return err;
 }
 
 
@@ -1956,22 +1970,22 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
+/* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
 			   struct fib_result *res,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos,
 			   struct rtable **result)
 {
-
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
-	unsigned flags = 0;
+	unsigned int flags = 0;
 	__be32 spec_dst;
 	u32 itag;
 
 	/* get a working reference to the output device */
-	out_dev = in_dev_get(FIB_RES_DEV(*res));
+	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
 	if (out_dev == NULL) {
 		if (net_ratelimit())
 			printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1986,7 +2000,6 @@ static int __mkroute_input(struct sk_buff *skb,
 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
 					 saddr);
 
-		err = -EINVAL;
 		goto cleanup;
 	}
 
@@ -2020,12 +2033,12 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
-		rth->u.dst.flags |= DST_NOXFRM;
+		rth->dst.flags |= DST_NOXFRM;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -2035,16 +2048,16 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= daddr;
 	rth->rt_iif 	=
 		rth->fl.iif	= in_dev->dev->ifindex;
-	rth->u.dst.dev	= (out_dev)->dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= (out_dev)->dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->fl.oif 	= 0;
 	rth->rt_spec_dst= spec_dst;
 
-	rth->u.dst.obsolete = -1;
-	rth->u.dst.input = ip_forward;
-	rth->u.dst.output = ip_output;
-	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
+	rth->dst.obsolete = -1;
+	rth->dst.input = ip_forward;
+	rth->dst.output = ip_output;
+	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
 
 	rt_set_nexthop(rth, res, itag);
 
@@ -2053,8 +2066,6 @@ static int __mkroute_input(struct sk_buff *skb,
 	*result = rth;
 	err = 0;
  cleanup:
-	/* release the working reference to the output device */
-	in_dev_put(out_dev);
 	return err;
 }
 
@@ -2080,7 +2091,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 
 	/* put it into the cache */
 	hash = rt_hash(daddr, saddr, fl->iif,
-		       rt_genid(dev_net(rth->u.dst.dev)));
+		       rt_genid(dev_net(rth->dst.dev)));
 	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
 }
 
@@ -2092,13 +2103,14 @@ static int ip_mkroute_input(struct sk_buff *skb,
  *	Such approach solves two big problems:
  *	1. Not simplex devices are handled properly.
  *	2. IP spoofing attempts are filtered with 100% of guarantee.
+ *	called with rcu_read_lock()
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			       u8 tos, struct net_device *dev)
 {
 	struct fib_result res;
-	struct in_device *in_dev = in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
 	struct flowi fl = { .nl_u = { .ip4_u =
 				      { .daddr = daddr,
 					.saddr = saddr,
@@ -2113,7 +2125,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned	hash;
 	__be32		spec_dst;
 	int		err = -EINVAL;
-	int		free_res = 0;
 	struct net    * net = dev_net(dev);
 
 	/* IP on this device is disabled. */
@@ -2129,7 +2140,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	    ipv4_is_loopback(saddr))
 		goto martian_source;
 
-	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
 		goto brd_input;
 
 	/* Accept zero addresses only to limited broadcast;
@@ -2138,19 +2149,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_zeronet(saddr))
 		goto martian_source;
 
-	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
-	    ipv4_is_loopback(daddr))
+	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
 		goto martian_destination;
 
 	/*
 	 *	Now we are ready to route packet.
 	 */
-	if ((err = fib_lookup(net, &fl, &res)) != 0) {
+	err = fib_lookup(net, &fl, &res);
+	if (err != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
 			goto e_hostunreach;
 		goto no_route;
 	}
-	free_res = 1;
 
 	RT_CACHE_STAT_INC(in_slow_tot);
 
@@ -2158,13 +2168,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto brd_input;
 
 	if (res.type == RTN_LOCAL) {
-		int result;
-		result = fib_validate_source(saddr, daddr, tos,
-					     net->loopback_dev->ifindex,
-					     dev, &spec_dst, &itag, skb->mark);
-		if (result < 0)
-			goto martian_source;
-		if (result)
+		err = fib_validate_source(saddr, daddr, tos,
+					  net->loopback_dev->ifindex,
+					  dev, &spec_dst, &itag, skb->mark);
+		if (err < 0)
+			goto martian_source_keep_err;
+		if (err)
 			flags |= RTCF_DIRECTSRC;
 		spec_dst = daddr;
 		goto local_input;
@@ -2176,10 +2185,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto martian_destination;
 
 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
-done:
-	in_dev_put(in_dev);
-	if (free_res)
-		fib_res_put(&res);
 out:	return err;
 
 brd_input:
@@ -2192,7 +2197,7 @@ brd_input:
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
 					  &itag, skb->mark);
 		if (err < 0)
-			goto martian_source;
+			goto martian_source_keep_err;
 		if (err)
 			flags |= RTCF_DIRECTSRC;
 	}
@@ -2205,14 +2210,14 @@ local_input:
 	if (!rth)
 		goto e_nobufs;
 
-	rth->u.dst.output= ip_rt_bug;
-	rth->u.dst.obsolete = -1;
+	rth->dst.output= ip_rt_bug;
+	rth->dst.obsolete = -1;
 	rth->rt_genid = rt_genid(net);
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 	rth->fl.fl4_dst	= daddr;
 	rth->rt_dst	= daddr;
 	rth->fl.fl4_tos	= tos;
@@ -2220,26 +2225,26 @@ local_input:
 	rth->fl.fl4_src	= saddr;
 	rth->rt_src	= saddr;
 #ifdef CONFIG_NET_CLS_ROUTE
-	rth->u.dst.tclassid = itag;
+	rth->dst.tclassid = itag;
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= net->loopback_dev;
-	dev_hold(rth->u.dst.dev);
-	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->dst.dev	= net->loopback_dev;
+	dev_hold(rth->dst.dev);
+	rth->idev	= in_dev_get(rth->dst.dev);
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
-	rth->u.dst.input= ip_local_deliver;
+	rth->dst.input= ip_local_deliver;
 	rth->rt_flags 	= flags|RTCF_LOCAL;
 	if (res.type == RTN_UNREACHABLE) {
-		rth->u.dst.input= ip_error;
-		rth->u.dst.error= -err;
+		rth->dst.input= ip_error;
+		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
 	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
-	goto done;
+	goto out;
 
 no_route:
 	RT_CACHE_STAT_INC(in_no_route);
@@ -2262,19 +2267,21 @@ martian_destination:
 
 e_hostunreach:
 	err = -EHOSTUNREACH;
-	goto done;
+	goto out;
 
 e_inval:
 	err = -EINVAL;
-	goto done;
+	goto out;
 
 e_nobufs:
 	err = -ENOBUFS;
-	goto done;
+	goto out;
 
 martian_source:
+	err = -EINVAL;
+martian_source_keep_err:
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-	goto e_inval;
+	goto out;
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2284,32 +2291,34 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned	hash;
 	int iif = dev->ifindex;
 	struct net *net;
+	int res;
 
 	net = dev_net(dev);
 
+	rcu_read_lock();
+
 	if (!rt_caching(net))
 		goto skip_cache;
 
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
-	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.dst.rt_next)) {
+	     rth = rcu_dereference(rth->dst.rt_next)) {
 		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
 		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
 		     (rth->fl.iif ^ iif) |
 		     rth->fl.oif |
 		     (rth->fl.fl4_tos ^ tos)) == 0 &&
 		    rth->fl.mark == skb->mark &&
-		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
 			if (noref) {
-				dst_use_noref(&rth->u.dst, jiffies);
-				skb_dst_set_noref(skb, &rth->u.dst);
+				dst_use_noref(&rth->dst, jiffies);
+				skb_dst_set_noref(skb, &rth->dst);
 			} else {
-				dst_use(&rth->u.dst, jiffies);
-				skb_dst_set(skb, &rth->u.dst);
+				dst_use(&rth->dst, jiffies);
+				skb_dst_set(skb, &rth->dst);
 			}
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
@@ -2317,7 +2326,6 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
 	}
-	rcu_read_unlock();
 
 skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
@@ -2332,12 +2340,11 @@ skip_cache:
 	   route cache entry is created eventually.
 	 */
 	if (ipv4_is_multicast(daddr)) {
-		struct in_device *in_dev;
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
 
-		rcu_read_lock();
-		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
+		if (in_dev) {
 			int our = ip_check_mc(in_dev, daddr, saddr,
-				ip_hdr(skb)->protocol);
+					      ip_hdr(skb)->protocol);
 			if (our
 #ifdef CONFIG_IP_MROUTE
 				||
@@ -2345,18 +2352,22 @@ skip_cache:
 			     IN_DEV_MFORWARD(in_dev))
 #endif
 			   ) {
+				int res = ip_route_input_mc(skb, daddr, saddr,
+							    tos, dev, our);
 				rcu_read_unlock();
-				return ip_route_input_mc(skb, daddr, saddr,
-							 tos, dev, our);
+				return res;
 			}
 		}
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	rcu_read_unlock();
+	return res;
 }
 EXPORT_SYMBOL(ip_route_input_common);
 
+/* called with rcu_read_lock() */
 static int __mkroute_output(struct rtable **result,
 			    struct fib_result *res,
 			    const struct flowi *fl,
@@ -2367,60 +2378,54 @@ static int __mkroute_output(struct rtable **result,
 	struct rtable *rth;
 	struct in_device *in_dev;
 	u32 tos = RT_FL_TOS(oldflp);
-	int err = 0;
 
-	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
 		return -EINVAL;
 
-	if (fl->fl4_dst == htonl(0xFFFFFFFF))
+	if (ipv4_is_lbcast(fl->fl4_dst))
 		res->type = RTN_BROADCAST;
 	else if (ipv4_is_multicast(fl->fl4_dst))
 		res->type = RTN_MULTICAST;
-	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
+	else if (ipv4_is_zeronet(fl->fl4_dst))
 		return -EINVAL;
 
 	if (dev_out->flags & IFF_LOOPBACK)
 		flags |= RTCF_LOCAL;
 
-	/* get work reference to inet device */
-	in_dev = in_dev_get(dev_out);
+	in_dev = __in_dev_get_rcu(dev_out);
 	if (!in_dev)
 		return -EINVAL;
 
 	if (res->type == RTN_BROADCAST) {
 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
-		if (res->fi) {
-			fib_info_put(res->fi);
-			res->fi = NULL;
-		}
+		res->fi = NULL;
 	} else if (res->type == RTN_MULTICAST) {
-		flags |= RTCF_MULTICAST|RTCF_LOCAL;
+		flags |= RTCF_MULTICAST | RTCF_LOCAL;
 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
 				 oldflp->proto))
 			flags &= ~RTCF_LOCAL;
 		/* If multicast route do not exist use
-		   default one, but do not gateway in this case.
-		   Yes, it is hack.
+		 * default one, but do not gateway in this case.
+		 * Yes, it is hack.
 		 */
-		if (res->fi && res->prefixlen < 4) {
-			fib_info_put(res->fi);
+		if (res->fi && res->prefixlen < 4)
 			res->fi = NULL;
-		}
 	}
 
 
 	rth = dst_alloc(&ipv4_dst_ops);
-	if (!rth) {
-		err = -ENOBUFS;
-		goto cleanup;
-	}
+	if (!rth)
+		return -ENOBUFS;
+
+	in_dev_hold(in_dev);
+	rth->idev = in_dev;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
-	rth->u.dst.flags= DST_HOST;
+	atomic_set(&rth->dst.__refcnt, 1);
+	rth->dst.flags= DST_HOST;
 	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
-		rth->u.dst.flags |= DST_NOXFRM;
+		rth->dst.flags |= DST_NOXFRM;
 	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
-		rth->u.dst.flags |= DST_NOPOLICY;
+		rth->dst.flags |= DST_NOPOLICY;
 
 	rth->fl.fl4_dst	= oldflp->fl4_dst;
 	rth->fl.fl4_tos	= tos;
@@ -2432,35 +2437,34 @@ static int __mkroute_output(struct rtable **result,
 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
 	/* get references to the devices that are to be hold by the routing
 	   cache entry */
-	rth->u.dst.dev	= dev_out;
+	rth->dst.dev	= dev_out;
 	dev_hold(dev_out);
-	rth->idev	= in_dev_get(dev_out);
 	rth->rt_gateway = fl->fl4_dst;
 	rth->rt_spec_dst= fl->fl4_src;
 
-	rth->u.dst.output=ip_output;
-	rth->u.dst.obsolete = -1;
+	rth->dst.output=ip_output;
+	rth->dst.obsolete = -1;
 	rth->rt_genid = rt_genid(dev_net(dev_out));
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL) {
-		rth->u.dst.input = ip_local_deliver;
+		rth->dst.input = ip_local_deliver;
 		rth->rt_spec_dst = fl->fl4_dst;
 	}
 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 		rth->rt_spec_dst = fl->fl4_src;
 		if (flags & RTCF_LOCAL &&
 		    !(dev_out->flags & IFF_LOOPBACK)) {
-			rth->u.dst.output = ip_mc_output;
+			rth->dst.output = ip_mc_output;
 			RT_CACHE_STAT_INC(out_slow_mc);
 		}
 #ifdef CONFIG_IP_MROUTE
 		if (res->type == RTN_MULTICAST) {
 			if (IN_DEV_MFORWARD(in_dev) &&
 			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
-				rth->u.dst.input = ip_mr_input;
-				rth->u.dst.output = ip_mc_output;
+				rth->dst.input = ip_mr_input;
+				rth->dst.output = ip_mc_output;
 			}
 		}
 #endif
@@ -2469,15 +2473,11 @@ static int __mkroute_output(struct rtable **result,
 	rt_set_nexthop(rth, res, 0);
 
 	rth->rt_flags = flags;
-
 	*result = rth;
- cleanup:
-	/* release work reference to inet device */
-	in_dev_put(in_dev);
-
-	return err;
+	return 0;
 }
 
+/* called with rcu_read_lock() */
 static int ip_mkroute_output(struct rtable **rp,
 			     struct fib_result *res,
 			     const struct flowi *fl,
@@ -2499,6 +2499,7 @@ static int ip_mkroute_output(struct rtable **rp,
 
 /*
  * Major route resolver routine.
+ * called with rcu_read_lock();
  */
 
 static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2517,9 +2518,8 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			    .iif = net->loopback_dev->ifindex,
 			    .oif = oldflp->oif };
 	struct fib_result res;
-	unsigned flags = 0;
+	unsigned int flags = 0;
 	struct net_device *dev_out = NULL;
-	int free_res = 0;
 	int err;
 
 
@@ -2545,9 +2545,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 
 		if (oldflp->oif == 0 &&
 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
-		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+		     ipv4_is_lbcast(oldflp->fl4_dst))) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-			dev_out = ip_dev_find(net, oldflp->fl4_src);
+			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
 			if (dev_out == NULL)
 				goto out;
 
@@ -2572,29 +2572,24 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 
 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-			dev_out = ip_dev_find(net, oldflp->fl4_src);
-			if (dev_out == NULL)
+			if (!__ip_dev_find(net, oldflp->fl4_src, false))
 				goto out;
-			dev_put(dev_out);
-			dev_out = NULL;
 		}
 	}
 
 
 	if (oldflp->oif) {
-		dev_out = dev_get_by_index(net, oldflp->oif);
+		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
 		err = -ENODEV;
 		if (dev_out == NULL)
 			goto out;
 
 		/* RACE: Check return value of inet_select_addr instead. */
-		if (__in_dev_get_rtnl(dev_out) == NULL) {
-			dev_put(dev_out);
+		if (rcu_dereference(dev_out->ip_ptr) == NULL)
 			goto out;	/* Wrong error code */
-		}
 
 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
-		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+		    ipv4_is_lbcast(oldflp->fl4_dst)) {
 			if (!fl.fl4_src)
 				fl.fl4_src = inet_select_addr(dev_out, 0,
 							      RT_SCOPE_LINK);
@@ -2614,10 +2609,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 		fl.fl4_dst = fl.fl4_src;
 		if (!fl.fl4_dst)
 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
-		if (dev_out)
-			dev_put(dev_out);
 		dev_out = net->loopback_dev;
-		dev_hold(dev_out);
 		fl.oif = net->loopback_dev->ifindex;
 		res.type = RTN_LOCAL;
 		flags |= RTCF_LOCAL;
@@ -2651,23 +2643,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			res.type = RTN_UNICAST;
 			goto make_route;
 		}
-		if (dev_out)
-			dev_put(dev_out);
 		err = -ENETUNREACH;
 		goto out;
 	}
-	free_res = 1;
 
 	if (res.type == RTN_LOCAL) {
 		if (!fl.fl4_src)
 			fl.fl4_src = fl.fl4_dst;
-		if (dev_out)
-			dev_put(dev_out);
 		dev_out = net->loopback_dev;
-		dev_hold(dev_out);
 		fl.oif = dev_out->ifindex;
-		if (res.fi)
-			fib_info_put(res.fi);
 		res.fi = NULL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
@@ -2684,28 +2668,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 	if (!fl.fl4_src)
 		fl.fl4_src = FIB_RES_PREFSRC(res);
 
-	if (dev_out)
-		dev_put(dev_out);
 	dev_out = FIB_RES_DEV(res);
-	dev_hold(dev_out);
 	fl.oif = dev_out->ifindex;
 
 
 make_route:
 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
 
-
-	if (free_res)
-		fib_res_put(&res);
-	if (dev_out)
-		dev_put(dev_out);
 out:	return err;
 }
 
 int __ip_route_output_key(struct net *net, struct rtable **rp,
 			  const struct flowi *flp)
 {
-	unsigned hash;
+	unsigned int hash;
+	int res;
 	struct rtable *rth;
 
 	if (!rt_caching(net))
@@ -2715,7 +2692,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 
 	rcu_read_lock_bh();
 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
+		rth = rcu_dereference_bh(rth->dst.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
 		    rth->fl.fl4_src == flp->fl4_src &&
 		    rth->fl.iif == 0 &&
@@ -2723,9 +2700,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    net_eq(dev_net(rth->dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			dst_use(&rth->dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
 			*rp = rth;
@@ -2736,11 +2713,18 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	rcu_read_unlock_bh();
 
 slow_output:
-	return ip_route_output_slow(net, rp, flp);
+	rcu_read_lock();
+	res = ip_route_output_slow(net, rp, flp);
+	rcu_read_unlock();
+	return res;
 }
-
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
+static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	return NULL;
+}
+
 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
 }
@@ -2749,9 +2733,8 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.family			=	AF_INET,
 	.protocol		=	cpu_to_be16(ETH_P_IP),
 	.destroy		=	ipv4_dst_destroy,
-	.check			=	ipv4_dst_check,
+	.check			=	ipv4_blackhole_dst_check,
 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
-	.entries		=	ATOMIC_INIT(0),
 };
 
 
@@ -2762,15 +2745,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		dst_alloc(&ipv4_dst_blackhole_ops);
 
 	if (rt) {
-		struct dst_entry *new = &rt->u.dst;
+		struct dst_entry *new = &rt->dst;
 
 		atomic_set(&new->__refcnt, 1);
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
-		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
 
-		new->dev = ort->u.dst.dev;
+		new->dev = ort->dst.dev;
 		if (new->dev)
 			dev_hold(new->dev);
 
@@ -2794,9 +2777,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 		dst_free(new);
 	}
 
-	dst_release(&(*rp)->u.dst);
+	dst_release(&(*rp)->dst);
 	*rp = rt;
-	return (rt ? 0 : -ENOMEM);
+	return rt ? 0 : -ENOMEM;
 }
 
 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
@@ -2822,13 +2805,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
 
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
 {
 	return ip_route_output_flow(net, rp, flp, NULL, 0);
 }
+EXPORT_SYMBOL(ip_route_output_key);
 
 static int rt_fill_info(struct net *net,
 			struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2864,11 +2847,11 @@ static int rt_fill_info(struct net *net,
 		r->rtm_src_len = 32;
 		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
 	}
-	if (rt->u.dst.dev)
-		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
+	if (rt->dst.dev)
+		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
 #ifdef CONFIG_NET_CLS_ROUTE
-	if (rt->u.dst.tclassid)
-		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
+	if (rt->dst.tclassid)
+		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
 #endif
 	if (rt->fl.iif)
 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
@@ -2878,12 +2861,16 @@ static int rt_fill_info(struct net *net,
 	if (rt->rt_dst != rt->rt_gateway)
 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
 
-	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
 		goto nla_put_failure;
 
-	error = rt->u.dst.error;
-	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
+	if (rt->fl.mark)
+		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+
+	error = rt->dst.error;
+	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
 	if (rt->peer) {
+		inet_peer_refcheck(rt->peer);
 		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
 		if (rt->peer->tcp_ts_stamp) {
 			ts = rt->peer->tcp_ts;
@@ -2914,7 +2901,7 @@ static int rt_fill_info(struct net *net,
 			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
 	}
 
-	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
+	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
 			       expires, error) < 0)
 		goto nla_put_failure;
 
@@ -2935,6 +2922,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	__be32 src = 0;
 	u32 iif;
 	int err;
+	int mark;
 	struct sk_buff *skb;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2962,6 +2950,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
 	if (iif) {
 		struct net_device *dev;
@@ -2974,13 +2963,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
+		skb->mark	= mark;
 		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
 
 		rt = skb_rtable(skb);
-		if (err == 0 && rt->u.dst.error)
-			err = -rt->u.dst.error;
+		if (err == 0 && rt->dst.error)
+			err = -rt->dst.error;
 	} else {
 		struct flowi fl = {
 			.nl_u = {
@@ -2991,6 +2981,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 				},
 			},
 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+			.mark = mark,
 		};
 		err = ip_route_output_key(net, &rt, &fl);
 	}
@@ -2998,7 +2989,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (err)
 		goto errout_free;
 
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
@@ -3034,12 +3025,12 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 			continue;
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
+			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb_dst_set_noref(skb, &rt->u.dst);
+			skb_dst_set_noref(skb, &rt->dst);
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
@@ -3313,6 +3304,12 @@ int __init ip_rt_init(void)
 
 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
+	if (dst_entries_init(&ipv4_dst_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
 	rt_hash_table = (struct rt_hash_bucket *)
 		alloc_large_system_hash("IP route cache",
 					sizeof(struct rt_hash_bucket),
@@ -3365,6 +3362,3 @@ void __init ip_static_sysctl_init(void)
 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
 }
 #endif
-
-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 9f6b22206c5..650cace2180 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,8 +18,8 @@
 #include <net/tcp.h>
 #include <net/route.h>
 
-/* Timestamps: lowest 9 bits store TCP options */
-#define TSBITS 9
+/* Timestamps: lowest bits store TCP options */
+#define TSBITS 6
 #define TSMASK (((__u32)1 << TSBITS) - 1)
 
 extern int sysctl_tcp_syncookies;
@@ -58,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 
 /*
  * when syncookies are in effect and tcp timestamps are enabled we encode
- * tcp options in the lowest 9 bits of the timestamp value that will be
+ * tcp options in the lower bits of the timestamp value that will be
  * sent in the syn-ack.
  * Since subsequent timestamps use the normal tcp_time_stamp value, we
  * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
@@ -70,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req)
 	u32 options = 0;
 
 	ireq = inet_rsk(req);
-	if (ireq->wscale_ok) {
-		options = ireq->snd_wscale;
-		options |= ireq->rcv_wscale << 4;
-	}
-	options |= ireq->sack_ok << 8;
+
+	options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
+	options |= ireq->sack_ok << 4;
+	options |= ireq->ecn_ok << 5;
 
 	ts = ts_now & ~TSMASK;
 	ts |= options;
@@ -138,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
 }
 
 /*
- * This table has to be sorted and terminated with (__u16)-1.
- * XXX generate a better table.
- * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ * MSS Values are taken from the 2009 paper
+ * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
+ *  - values 1440 to 1460 accounted for 80% of observed mss values
+ *  - values outside the 536-1460 range are rare (<0.2%).
+ *
+ * Table must be sorted.
  */
 static __u16 const msstab[] = {
-	64 - 1,
-	256 - 1,
-	512 - 1,
-	536 - 1,
-	1024 - 1,
-	1440 - 1,
-	1460 - 1,
-	4312 - 1,
-	(__u16)-1
+	64,
+	512,
+	536,
+	1024,
+	1440,
+	1460,
+	4312,
+	8960,
 };
-/* The number doesn't include the -1 terminator */
-#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
 
 /*
  * Generate a syncookie.  mssp points to the mss, which is returned
@@ -169,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
 
 	tcp_synq_overflow(sk);
 
-	/* XXX sort msstab[] by probability?  Binary search? */
-	for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
-		;
-	*mssp = msstab[mssind] + 1;
+	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+		if (mss >= msstab[mssind])
+			break;
+	*mssp = msstab[mssind];
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
 
@@ -202,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 					    jiffies / (HZ * 60),
 					    COUNTER_TRIES);
 
-	return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
 
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
@@ -227,26 +226,38 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
  * additional tcp options in the timestamp.
  * This extracts these options from the timestamp echo.
  *
- * The lowest 4 bits are for snd_wscale
- * The next 4 lsb are for rcv_wscale
- * The next lsb is for sack_ok
+ * The lowest 4 bits store snd_wscale.
+ * next 2 bits indicate SACK and ECN support.
+ *
+ * return false if we decode an option that should not be.
  */
-void cookie_check_timestamp(struct tcp_options_received *tcp_opt)
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
 {
-	/* echoed timestamp, 9 lowest bits contain options */
+	/* echoed timestamp, lowest bits contain options */
 	u32 options = tcp_opt->rcv_tsecr & TSMASK;
 
-	tcp_opt->snd_wscale = options & 0xf;
-	options >>= 4;
-	tcp_opt->rcv_wscale = options & 0xf;
+	if (!tcp_opt->saw_tstamp)  {
+		tcp_clear_options(tcp_opt);
+		return true;
+	}
+
+	if (!sysctl_tcp_timestamps)
+		return false;
 
 	tcp_opt->sack_ok = (options >> 4) & 0x1;
+	*ecn_ok = (options >> 5) & 1;
+	if (*ecn_ok && !sysctl_tcp_ecn)
+		return false;
+
+	if (tcp_opt->sack_ok && !sysctl_tcp_sack)
+		return false;
 
-	if (tcp_opt->sack_ok)
-		tcp_sack_reset(tcp_opt);
+	if ((options & 0xf) == 0xf)
+		return true; /* no window scaling */
 
-	if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale)
-		tcp_opt->wscale_ok = 1;
+	tcp_opt->wscale_ok = 1;
+	tcp_opt->snd_wscale = options & 0xf;
+	return sysctl_tcp_window_scaling != 0;
 }
 EXPORT_SYMBOL(cookie_check_timestamp);
 
@@ -265,8 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	int mss;
 	struct rtable *rt;
 	__u8 rcv_wscale;
+	bool ecn_ok;
 
-	if (!sysctl_tcp_syncookies || !th->ack)
+	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk) ||
@@ -281,8 +293,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
 	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
 
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
+	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+		goto out;
 
 	ret = NULL;
 	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
@@ -298,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->rmt_port		= th->source;
 	ireq->loc_addr		= ip_hdr(skb)->daddr;
 	ireq->rmt_addr		= ip_hdr(skb)->saddr;
-	ireq->ecn_ok		= 0;
+	ireq->ecn_ok		= ecn_ok;
 	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
 	ireq->sack_ok		= tcp_opt.sack_ok;
 	ireq->wscale_ok		= tcp_opt.wscale_ok;
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
@@ -354,15 +365,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	}
 
 	/* Try to redo what tcp_v4_send_synack did. */
-	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
+	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
 
 	tcp_select_initial_window(tcp_full_space(sk), req->mss,
 				  &req->rcv_wnd, &req->window_clamp,
 				  ireq->wscale_ok, &rcv_wscale,
-				  dst_metric(&rt->u.dst, RTAX_INITRWND));
+				  dst_metric(&rt->dst, RTAX_INITRWND));
 
 	ireq->rcv_wscale  = rcv_wscale;
 
-	ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
+	ret = get_cookie_sock(sk, skb, req, &rt->dst);
 out:	return ret;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 65afeaec15b..1664a0590bb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -315,7 +315,6 @@ struct tcp_splice_state {
  * is strict, actions are advisory and have some latency.
  */
 int tcp_memory_pressure __read_mostly;
-
 EXPORT_SYMBOL(tcp_memory_pressure);
 
 void tcp_enter_memory_pressure(struct sock *sk)
@@ -325,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk)
 		tcp_memory_pressure = 1;
 	}
 }
-
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
 /* Convert seconds to retransmits based on initial and max timeout */
@@ -388,8 +386,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	 */
 
 	mask = 0;
-	if (sk->sk_err)
-		mask = POLLERR;
 
 	/*
 	 * POLLHUP is certainly not done right. But poll() doesn't
@@ -453,13 +449,20 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 					mask |= POLLOUT | POLLWRNORM;
 			}
-		}
+		} else
+			mask |= POLLOUT | POLLWRNORM;
 
 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
 	}
+	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	smp_rmb();
+	if (sk->sk_err)
+		mask |= POLLERR;
+
 	return mask;
 }
+EXPORT_SYMBOL(tcp_poll);
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
@@ -508,10 +511,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 
 	return put_user(answ, (int __user *)arg);
 }
+EXPORT_SYMBOL(tcp_ioctl);
 
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+	TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
 	tp->pushed_seq = tp->write_seq;
 }
 
@@ -527,7 +531,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 
 	skb->csum    = 0;
 	tcb->seq     = tcb->end_seq = tp->write_seq;
-	tcb->flags   = TCPCB_FLAG_ACK;
+	tcb->flags   = TCPHDR_ACK;
 	tcb->sacked  = 0;
 	skb_header_release(skb);
 	tcp_add_write_queue_tail(sk, skb);
@@ -676,6 +680,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 
 	return ret;
 }
+EXPORT_SYMBOL(tcp_splice_read);
 
 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 {
@@ -816,7 +821,7 @@ new_segment:
 		skb_shinfo(skb)->gso_segs = 0;
 
 		if (!copied)
-			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+			TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
 
 		copied += copy;
 		poffset += copy;
@@ -857,15 +862,15 @@ out_err:
 	return sk_stream_error(sk, flags, err);
 }
 
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
-		     size_t size, int flags)
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+		 size_t size, int flags)
 {
 	ssize_t res;
-	struct sock *sk = sock->sk;
 
 	if (!(sk->sk_route_caps & NETIF_F_SG) ||
 	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
-		return sock_no_sendpage(sock, page, offset, size, flags);
+		return sock_no_sendpage(sk->sk_socket, page, offset, size,
+					flags);
 
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
@@ -874,6 +879,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 	release_sock(sk);
 	return res;
 }
+EXPORT_SYMBOL(tcp_sendpage);
 
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
@@ -898,10 +904,9 @@ static inline int select_size(struct sock *sk, int sg)
 	return tmp;
 }
 
-int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		size_t size)
 {
-	struct sock *sk = sock->sk;
 	struct iovec *iov;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -938,7 +943,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	sg = sk->sk_route_caps & NETIF_F_SG;
 
 	while (--iovlen >= 0) {
-		int seglen = iov->iov_len;
+		size_t seglen = iov->iov_len;
 		unsigned char __user *from = iov->iov_base;
 
 		iov++;
@@ -1062,7 +1067,7 @@ new_segment:
 			}
 
 			if (!copied)
-				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+				TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
 
 			tp->write_seq += copy;
 			TCP_SKB_CB(skb)->end_seq += copy;
@@ -1122,6 +1127,7 @@ out_err:
 	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL(tcp_sendmsg);
 
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
@@ -1381,6 +1387,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		tcp_cleanup_rbuf(sk, copied);
 	return copied;
 }
+EXPORT_SYMBOL(tcp_read_sock);
 
 /*
  *	This routine copies from a sock struct into the user buffer.
@@ -1775,6 +1782,7 @@ recv_urg:
 	err = tcp_recv_urg(sk, msg, len, flags);
 	goto out;
 }
+EXPORT_SYMBOL(tcp_recvmsg);
 
 void tcp_set_state(struct sock *sk, int state)
 {
@@ -1867,6 +1875,7 @@ void tcp_shutdown(struct sock *sk, int how)
 			tcp_send_fin(sk);
 	}
 }
+EXPORT_SYMBOL(tcp_shutdown);
 
 void tcp_close(struct sock *sk, long timeout)
 {
@@ -1899,6 +1908,10 @@ void tcp_close(struct sock *sk, long timeout)
 
 	sk_mem_reclaim(sk);
 
+	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
+	if (sk->sk_state == TCP_CLOSE)
+		goto adjudge_to_death;
+
 	/* As outlined in RFC 2525, section 2.17, we send a RST here because
 	 * data was lost. To witness the awful effects of the old behavior of
 	 * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -2002,11 +2015,8 @@ adjudge_to_death:
 		}
 	}
 	if (sk->sk_state != TCP_CLOSE) {
-		int orphan_count = percpu_counter_read_positive(
-						sk->sk_prot->orphan_count);
-
 		sk_mem_reclaim(sk);
-		if (tcp_too_many_orphans(sk, orphan_count)) {
+		if (tcp_too_many_orphans(sk, 0)) {
 			if (net_ratelimit())
 				printk(KERN_INFO "TCP: too many of orphaned "
 				       "sockets\n");
@@ -2026,6 +2036,7 @@ out:
 	local_bh_enable();
 	sock_put(sk);
 }
+EXPORT_SYMBOL(tcp_close);
 
 /* These states need RST on ABORT according to RFC793 */
 
@@ -2099,6 +2110,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	sk->sk_error_report(sk);
 	return err;
 }
+EXPORT_SYMBOL(tcp_disconnect);
 
 /*
  *	Socket option code for TCP.
@@ -2176,6 +2188,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 				      GFP_KERNEL);
 			if (cvp == NULL)
 				return -ENOMEM;
+
+			kref_init(&cvp->kref);
 		}
 		lock_sock(sk);
 		tp->rx_opt.cookie_in_always =
@@ -2190,12 +2204,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 				 */
 				kref_put(&tp->cookie_values->kref,
 					 tcp_cookie_values_release);
-				kref_init(&cvp->kref);
-				tp->cookie_values = cvp;
 			} else {
 				cvp = tp->cookie_values;
 			}
 		}
+
 		if (cvp != NULL) {
 			cvp->cookie_desired = ctd.tcpct_cookie_desired;
 
@@ -2209,6 +2222,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 				cvp->s_data_desired = ctd.tcpct_s_data_desired;
 				cvp->s_data_constant = 0; /* false */
 			}
+
+			tp->cookie_values = cvp;
 		}
 		release_sock(sk);
 		return err;
@@ -2377,7 +2392,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		err = tp->af_specific->md5_parse(sk, optval, optlen);
 		break;
 #endif
-
+	case TCP_USER_TIMEOUT:
+		/* Cap the max timeout in ms TCP will retry/retrans
+		 * before giving up and aborting (ETIMEDOUT) a connection.
+		 */
+		icsk->icsk_user_timeout = msecs_to_jiffies(val);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2397,6 +2417,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 						     optval, optlen);
 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
 }
+EXPORT_SYMBOL(tcp_setsockopt);
 
 #ifdef CONFIG_COMPAT
 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -2407,7 +2428,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
 						  optval, optlen);
 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
 }
-
 EXPORT_SYMBOL(compat_tcp_setsockopt);
 #endif
 
@@ -2473,7 +2493,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 }
-
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
 static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2591,6 +2610,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EFAULT;
 		return 0;
 	}
+	case TCP_THIN_LINEAR_TIMEOUTS:
+		val = tp->thin_lto;
+		break;
+	case TCP_THIN_DUPACK:
+		val = tp->thin_dupack;
+		break;
+
+	case TCP_USER_TIMEOUT:
+		val = jiffies_to_msecs(icsk->icsk_user_timeout);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2612,6 +2641,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 						     optval, optlen);
 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
 }
+EXPORT_SYMBOL(tcp_getsockopt);
 
 #ifdef CONFIG_COMPAT
 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2622,7 +2652,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
 						  optval, optlen);
 	return do_tcp_getsockopt(sk, level, optname, optval, optlen);
 }
-
 EXPORT_SYMBOL(compat_tcp_getsockopt);
 #endif
 
@@ -2859,7 +2888,6 @@ void tcp_free_md5sig_pool(void)
 	if (pool)
 		__tcp_free_md5sig_pool(pool);
 }
-
 EXPORT_SYMBOL(tcp_free_md5sig_pool);
 
 static struct tcp_md5sig_pool * __percpu *
@@ -2935,7 +2963,6 @@ retry:
 	}
 	return pool;
 }
-
 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
 
@@ -2959,7 +2986,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 	spin_unlock(&tcp_md5sig_pool_lock);
 
 	if (p)
-		return *per_cpu_ptr(p, smp_processor_id());
+		return *this_cpu_ptr(p);
 
 	local_bh_enable();
 	return NULL;
@@ -2987,7 +3014,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
 	th->check = old_checksum;
 	return err;
 }
-
 EXPORT_SYMBOL(tcp_md5_hash_header);
 
 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -3000,6 +3026,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 	const unsigned head_data_len = skb_headlen(skb) > header_len ?
 				       skb_headlen(skb) - header_len : 0;
 	const struct skb_shared_info *shi = skb_shinfo(skb);
+	struct sk_buff *frag_iter;
 
 	sg_init_table(&sg, 1);
 
@@ -3014,9 +3041,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 			return 1;
 	}
 
+	skb_walk_frags(skb, frag_iter)
+		if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
+			return 1;
+
 	return 0;
 }
-
 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
 
 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
@@ -3026,7 +3056,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
 	sg_init_one(&sg, key->key, key->keylen);
 	return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
 }
-
 EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
@@ -3193,7 +3222,7 @@ void __init tcp_init(void)
 {
 	struct sk_buff *skb = NULL;
 	unsigned long nr_pages, limit;
-	int order, i, max_share;
+	int i, max_share, cnt;
 	unsigned long jiffy = jiffies;
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3242,22 +3271,12 @@ void __init tcp_init(void)
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
-	/* Try to be a bit smarter and adjust defaults depending
-	 * on available memory.
-	 */
-	for (order = 0; ((1 << order) << PAGE_SHIFT) <
-			(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
-			order++)
-		;
-	if (order >= 4) {
-		tcp_death_row.sysctl_max_tw_buckets = 180000;
-		sysctl_tcp_max_orphans = 4096 << (order - 4);
-		sysctl_max_syn_backlog = 1024;
-	} else if (order < 3) {
-		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
-		sysctl_tcp_max_orphans >>= (3 - order);
-		sysctl_max_syn_backlog = 128;
-	}
+
+	cnt = tcp_hashinfo.ehash_mask + 1;
+
+	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
+	sysctl_tcp_max_orphans = cnt / 2;
+	sysctl_max_syn_backlog = max(128, cnt / 256);
 
 	/* Set the pressure threshold to be a fraction of global memory that
 	 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
@@ -3298,16 +3317,3 @@ void __init tcp_init(void)
 	tcp_secret_retiring = &tcp_secret_two;
 	tcp_secret_secondary = &tcp_secret_two;
 }
-
-EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_disconnect);
-EXPORT_SYMBOL(tcp_getsockopt);
-EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_poll);
-EXPORT_SYMBOL(tcp_read_sock);
-EXPORT_SYMBOL(tcp_recvmsg);
-EXPORT_SYMBOL(tcp_sendmsg);
-EXPORT_SYMBOL(tcp_splice_read);
-EXPORT_SYMBOL(tcp_sendpage);
-EXPORT_SYMBOL(tcp_setsockopt);
-EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 0ec9bd0ae94..850c737e08e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
 int tcp_set_allowed_congestion_control(char *val)
 {
 	struct tcp_congestion_ops *ca;
-	char *clone, *name;
+	char *saved_clone, *clone, *name;
 	int ret = 0;
 
-	clone = kstrdup(val, GFP_USER);
+	saved_clone = clone = kstrdup(val, GFP_USER);
 	if (!clone)
 		return -ENOMEM;
 
@@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val)
 	}
 out:
 	spin_unlock(&tcp_cong_list_lock);
+	kfree(saved_clone);
 
 	return ret;
 }
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1eba160b72d..00ca688d896 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -6,7 +6,7 @@
  * The algorithm is described in:
  * "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
  *  for High-Speed Networks"
- * http://www.ews.uiuc.edu/~shaoliu/papersandslides/liubassri06perf.pdf
+ * http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
  *
  * Implemented from description in paper and ns-2 simulation.
  * Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 548d575e6cc..3357f69e353 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
 int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
 int sysctl_tcp_ecn __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_ecn);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
@@ -179,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
 		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 }
 
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	tcp_incr_quickack(sk);
@@ -419,15 +422,16 @@ void tcp_initialize_rcv_mss(struct sock *sk)
 
 	inet_csk(sk)->icsk_ack.rcv_mss = hint;
 }
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
 
 /* Receiver "autotuning" code.
  *
  * The algorithm for RTT estimation w/o timestamps is based on
  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
  *
  * More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
  * though this reference is out of date.  A new paper
  * is pending.
  */
@@ -801,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
 	}
 }
 
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- *	The RFC specifies a window of no more than 4380 bytes
- *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
- *	is a bit misleading because they use a clamp at 4380 bytes
- *	rather than use a multiplier in the relevant range.
- */
 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-	if (!cwnd) {
-		if (tp->mss_cache > 1460)
-			cwnd = 2;
-		else
-			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
-	}
+	if (!cwnd)
+		cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
@@ -2310,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
 
 static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 {
-	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+	return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
 }
 
 static inline int tcp_head_timedout(struct sock *sk)
@@ -2504,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -2512,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 	int err;
 	unsigned int mss;
 
-	if (packets == 0)
-		return;
-
 	WARN_ON(packets > tp->packets_out);
 	if (tp->lost_skb_hint) {
 		skb = tp->lost_skb_hint;
 		cnt = tp->lost_cnt_hint;
+		/* Head already handled? */
+		if (mark_head && skb != tcp_write_queue_head(sk))
+			return;
 	} else {
 		skb = tcp_write_queue_head(sk);
 		cnt = 0;
@@ -2541,7 +2532,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 			cnt += tcp_skb_pcount(skb);
 
 		if (cnt > packets) {
-			if (tcp_is_sack(tp) || (oldcnt >= packets))
+			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+			    (oldcnt >= packets))
 				break;
 
 			mss = skb_shinfo(skb)->gso_size;
@@ -2552,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 		}
 
 		tcp_skb_mark_lost(tp, skb);
+
+		if (mark_head)
+			break;
 	}
 	tcp_verify_left_out(tp);
 }
@@ -2563,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1);
+		tcp_mark_head_lost(sk, 1, 1);
 	} else if (tcp_is_fack(tp)) {
 		int lost = tp->fackets_out - tp->reordering;
 		if (lost <= 0)
 			lost = 1;
-		tcp_mark_head_lost(sk, lost);
+		tcp_mark_head_lost(sk, lost, 0);
 	} else {
 		int sacked_upto = tp->sacked_out - tp->reordering;
-		if (sacked_upto < fast_rexmit)
-			sacked_upto = fast_rexmit;
-		tcp_mark_head_lost(sk, sacked_upto);
+		if (sacked_upto >= 0)
+			tcp_mark_head_lost(sk, sacked_upto, 0);
+		else if (fast_rexmit)
+			tcp_mark_head_lost(sk, 1, 1);
 	}
 
 	tcp_timeout_skbs(sk);
@@ -2882,7 +2878,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
 		       icsk->icsk_mtup.probe_size;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
 
 	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 	icsk->icsk_mtup.probe_size = 0;
@@ -2938,6 +2934,7 @@ void tcp_simple_retransmit(struct sock *sk)
 	}
 	tcp_xmit_retransmit_queue(sk);
 }
+EXPORT_SYMBOL(tcp_simple_retransmit);
 
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
@@ -2978,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	    before(tp->snd_una, tp->high_seq) &&
 	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
-		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
 	}
 
@@ -3286,7 +3283,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		 * connection startup slow start one packet too
 		 * quickly.  This is severely frowned upon behavior.
 		 */
-		if (!(scb->flags & TCPCB_FLAG_SYN)) {
+		if (!(scb->flags & TCPHDR_SYN)) {
 			flag |= FLAG_DATA_ACKED;
 		} else {
 			flag |= FLAG_SYN_ACKED;
@@ -3406,8 +3403,8 @@ static void tcp_ack_probe(struct sock *sk)
 
 static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
-	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
-		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
 static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3424,9 +3421,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
 					const u32 ack, const u32 ack_seq,
 					const u32 nwin)
 {
-	return (after(ack, tp->snd_una) ||
+	return	after(ack, tp->snd_una) ||
 		after(ack_seq, tp->snd_wl1) ||
-		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
 }
 
 /* Update our send window.
@@ -3858,6 +3855,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 		}
 	}
 }
+EXPORT_SYMBOL(tcp_parse_options);
 
 static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
 {
@@ -3924,13 +3922,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
 			if (opsize < 2 || opsize > length)
 				return NULL;
 			if (opcode == TCPOPT_MD5SIG)
-				return ptr;
+				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
 		}
 		ptr += opsize - 2;
 		length -= opsize;
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
 #endif
 
 static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -4041,6 +4040,8 @@ static void tcp_reset(struct sock *sk)
 	default:
 		sk->sk_err = ECONNRESET;
 	}
+	/* This barrier is coupled with smp_rmb() in tcp_poll() */
+	smp_wmb();
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_error_report(sk);
@@ -5432,6 +5433,7 @@ discard:
 	__kfree_skb(skb);
 	return 0;
 }
+EXPORT_SYMBOL(tcp_rcv_established);
 
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
@@ -5931,14 +5933,4 @@ discard:
 	}
 	return 0;
 }
-
-EXPORT_SYMBOL(sysctl_tcp_ecn);
-EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-EXPORT_SYMBOL(tcp_parse_options);
-#ifdef CONFIG_TCP_MD5SIG
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
-#endif
-EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fe193e53af4..8f8527d4168 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -84,6 +84,7 @@
 
 int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
 
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -100,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 #endif
 
 struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
 
 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 {
@@ -139,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 
 	return 0;
 }
-
 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 
 /* This will initiate an outgoing connection. */
@@ -204,10 +205,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 		 * when trying new connection.
 		 */
-		if (peer != NULL &&
-		    (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
-			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
-			tp->rx_opt.ts_recent = peer->tcp_ts;
+		if (peer) {
+			inet_peer_refcheck(peer);
+			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
+				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+				tp->rx_opt.ts_recent = peer->tcp_ts;
+			}
 		}
 	}
 
@@ -237,7 +240,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
-	sk_setup_caps(sk, &rt->u.dst);
+	sk_setup_caps(sk, &rt->dst);
 
 	if (!tp->write_seq)
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -265,6 +268,7 @@ failure:
 	inet->inet_dport = 0;
 	return err;
 }
+EXPORT_SYMBOL(tcp_v4_connect);
 
 /*
  * This routine does path mtu discovery as defined in RFC1191.
@@ -543,6 +547,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
 
 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
 }
+EXPORT_SYMBOL(tcp_v4_send_check);
 
 int tcp_v4_gso_send_check(struct sk_buff *skb)
 {
@@ -793,19 +798,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
 	kfree(inet_rsk(req)->opt);
 }
 
-#ifdef CONFIG_SYN_COOKIES
-static void syn_flood_warning(struct sk_buff *skb)
+static void syn_flood_warning(const struct sk_buff *skb)
 {
-	static unsigned long warntime;
+	const char *msg;
 
-	if (time_after(jiffies, (warntime + HZ * 60))) {
-		warntime = jiffies;
-		printk(KERN_INFO
-		       "possible SYN flooding on port %d. Sending cookies.\n",
-		       ntohs(tcp_hdr(skb)->dest));
-	}
-}
+#ifdef CONFIG_SYN_COOKIES
+	if (sysctl_tcp_syncookies)
+		msg = "Sending cookies";
+	else
 #endif
+		msg = "Dropping request";
+
+	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
+				ntohs(tcp_hdr(skb)->dest), msg);
+}
 
 /*
  * Save and compile IPv4 options into the request_sock if needed.
@@ -857,7 +863,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 {
 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 
 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
@@ -924,7 +929,6 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 	}
 	return 0;
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 
 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
@@ -962,7 +966,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 	}
 	return -ENOENT;
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 
 static void tcp_v4_clear_md5_list(struct sock *sk)
@@ -1135,7 +1138,6 @@ clear_hash_noput:
 	memset(md5_hash, 0, 16);
 	return 1;
 }
-
 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
 
 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
@@ -1243,6 +1245,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	 * evidently real one.
 	 */
 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+		if (net_ratelimit())
+			syn_flood_warning(skb);
 #ifdef CONFIG_SYN_COOKIES
 		if (sysctl_tcp_syncookies) {
 			want_cookie = 1;
@@ -1323,15 +1327,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	if (!want_cookie)
+	if (!want_cookie || tmp_opt.tstamp_ok)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
 
 	if (want_cookie) {
-#ifdef CONFIG_SYN_COOKIES
-		syn_flood_warning(skb);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-#endif
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
 		struct inet_peer *peer = NULL;
 
@@ -1349,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
 		    peer->v4daddr == saddr) {
+			inet_peer_refcheck(peer);
 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
 			    (s32)(peer->tcp_ts - req->ts_recent) >
 							TCP_PAWS_WINDOW) {
@@ -1393,6 +1395,7 @@ drop_and_free:
 drop:
 	return 0;
 }
+EXPORT_SYMBOL(tcp_v4_conn_request);
 
 
 /*
@@ -1419,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (!newsk)
-		goto exit;
+		goto exit_nonewsk;
 
 	newsk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(newsk, dst);
@@ -1466,18 +1469,23 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto exit;
+	}
 	__inet_hash_nolisten(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 exit_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	dst_release(dst);
 exit:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-	dst_release(dst);
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
 
 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 {
@@ -1504,7 +1512,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_SYN_COOKIES
-	if (!th->rst && !th->syn && th->ack)
+	if (!th->syn)
 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
 #endif
 	return sk;
@@ -1607,6 +1615,7 @@ csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
 	goto discard;
 }
+EXPORT_SYMBOL(tcp_v4_do_rcv);
 
 /*
  *	From tcp_input.c
@@ -1793,6 +1802,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
 
 	return 0;
 }
+EXPORT_SYMBOL(tcp_v4_remember_stamp);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
 {
@@ -1832,6 +1842,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
 	.compat_getsockopt = compat_ip_getsockopt,
 #endif
 };
+EXPORT_SYMBOL(ipv4_specific);
 
 #ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
@@ -1960,7 +1971,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	percpu_counter_dec(&tcp_sockets_allocated);
 }
-
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
 #ifdef CONFIG_PROC_FS
@@ -1978,6 +1988,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
 }
 
+/*
+ * Get next listener socket follow cur.  If cur is NULL, get first socket
+ * starting from bucket given in st->bucket; when st->bucket is zero the
+ * very first socket in the hash table is returned.
+ */
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
 	struct inet_connection_sock *icsk;
@@ -1988,14 +2003,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	struct net *net = seq_file_net(seq);
 
 	if (!sk) {
-		st->bucket = 0;
-		ilb = &tcp_hashinfo.listening_hash[0];
+		ilb = &tcp_hashinfo.listening_hash[st->bucket];
 		spin_lock_bh(&ilb->lock);
 		sk = sk_nulls_head(&ilb->head);
+		st->offset = 0;
 		goto get_sk;
 	}
 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
 	++st->num;
+	++st->offset;
 
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
 		struct request_sock *req = cur;
@@ -2010,6 +2026,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 				}
 				req = req->dl_next;
 			}
+			st->offset = 0;
 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
 				break;
 get_req:
@@ -2045,6 +2062,7 @@ start_req:
 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
 	}
 	spin_unlock_bh(&ilb->lock);
+	st->offset = 0;
 	if (++st->bucket < INET_LHTABLE_SIZE) {
 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
 		spin_lock_bh(&ilb->lock);
@@ -2058,7 +2076,12 @@ out:
 
 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 {
-	void *rc = listening_get_next(seq, NULL);
+	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	st->bucket = 0;
+	st->offset = 0;
+	rc = listening_get_next(seq, NULL);
 
 	while (rc && *pos) {
 		rc = listening_get_next(seq, rc);
@@ -2073,13 +2096,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
 }
 
+/*
+ * Get first established socket starting from bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
 static void *established_get_first(struct seq_file *seq)
 {
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
 
-	for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+	st->offset = 0;
+	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
 		struct inet_timewait_sock *tw;
@@ -2124,6 +2152,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	struct net *net = seq_file_net(seq);
 
 	++st->num;
+	++st->offset;
 
 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
 		tw = cur;
@@ -2140,6 +2169,7 @@ get_tw:
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 
 		/* Look for next non empty bucket */
+		st->offset = 0;
 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
 				empty_bucket(st))
 			;
@@ -2167,7 +2197,11 @@ out:
 
 static void *established_get_idx(struct seq_file *seq, loff_t pos)
 {
-	void *rc = established_get_first(seq);
+	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	st->bucket = 0;
+	rc = established_get_first(seq);
 
 	while (rc && pos) {
 		rc = established_get_next(seq, rc);
@@ -2192,24 +2226,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
 	return rc;
 }
 
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+	struct tcp_iter_state *st = seq->private;
+	int offset = st->offset;
+	int orig_num = st->num;
+	void *rc = NULL;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+	case TCP_SEQ_STATE_LISTENING:
+		if (st->bucket >= INET_LHTABLE_SIZE)
+			break;
+		st->state = TCP_SEQ_STATE_LISTENING;
+		rc = listening_get_next(seq, NULL);
+		while (offset-- && rc)
+			rc = listening_get_next(seq, rc);
+		if (rc)
+			break;
+		st->bucket = 0;
+		/* Fallthrough */
+	case TCP_SEQ_STATE_ESTABLISHED:
+	case TCP_SEQ_STATE_TIME_WAIT:
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		if (st->bucket > tcp_hashinfo.ehash_mask)
+			break;
+		rc = established_get_first(seq);
+		while (offset-- && rc)
+			rc = established_get_next(seq, rc);
+	}
+
+	st->num = orig_num;
+
+	return rc;
+}
+
 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
+	void *rc;
+
+	if (*pos && *pos == st->last_pos) {
+		rc = tcp_seek_last_pos(seq);
+		if (rc)
+			goto out;
+	}
+
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
-	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	st->bucket = 0;
+	st->offset = 0;
+	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+	st->last_pos = *pos;
+	return rc;
 }
 
 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct tcp_iter_state *st = seq->private;
 	void *rc = NULL;
-	struct tcp_iter_state *st;
 
 	if (v == SEQ_START_TOKEN) {
 		rc = tcp_get_idx(seq, 0);
 		goto out;
 	}
-	st = seq->private;
 
 	switch (st->state) {
 	case TCP_SEQ_STATE_OPENREQ:
@@ -2217,6 +2299,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		rc = listening_get_next(seq, v);
 		if (!rc) {
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
+			st->bucket = 0;
+			st->offset = 0;
 			rc	  = established_get_first(seq);
 		}
 		break;
@@ -2227,6 +2311,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	}
 out:
 	++*pos;
+	st->last_pos = *pos;
 	return rc;
 }
 
@@ -2265,6 +2350,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
 
 	s = ((struct seq_file *)file->private_data)->private;
 	s->family		= afinfo->family;
+	s->last_pos 		= 0;
 	return 0;
 }
 
@@ -2288,11 +2374,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
 		rc = -ENOMEM;
 	return rc;
 }
+EXPORT_SYMBOL(tcp_proc_register);
 
 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
 {
 	proc_net_remove(net, afinfo->name);
 }
+EXPORT_SYMBOL(tcp_proc_unregister);
 
 static void get_openreq4(struct sock *sk, struct request_sock *req,
 			 struct seq_file *f, int i, int uid, int *len)
@@ -2487,7 +2575,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 	return tcp_gro_receive(head, skb);
 }
-EXPORT_SYMBOL(tcp4_gro_receive);
 
 int tcp4_gro_complete(struct sk_buff *skb)
 {
@@ -2500,7 +2587,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
 
 	return tcp_gro_complete(skb);
 }
-EXPORT_SYMBOL(tcp4_gro_complete);
 
 struct proto tcp_prot = {
 	.name			= "TCP",
@@ -2516,6 +2602,8 @@ struct proto tcp_prot = {
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
 	.recvmsg		= tcp_recvmsg,
+	.sendmsg		= tcp_sendmsg,
+	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
@@ -2534,11 +2622,13 @@ struct proto tcp_prot = {
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
+	.no_autobind		= true,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
 };
+EXPORT_SYMBOL(tcp_prot);
 
 
 static int __net_init tcp_sk_init(struct net *net)
@@ -2569,20 +2659,3 @@ void __init tcp_v4_init(void)
 	if (register_pernet_subsys(&tcp_sk_ops))
 		panic("Failed to create the TCP control socket.\n");
 }
-
-EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_v4_conn_request);
-EXPORT_SYMBOL(tcp_v4_connect);
-EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_remember_stamp);
-EXPORT_SYMBOL(tcp_v4_send_check);
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(tcp_proc_register);
-EXPORT_SYMBOL(tcp_proc_unregister);
-#endif
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
-
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 794c2e122a4..43cf901d765 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -47,7 +47,6 @@ struct inet_timewait_death_row tcp_death_row = {
 	.twcal_timer	= TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
 					    (unsigned long)&tcp_death_row),
 };
-
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
 static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
@@ -56,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 		return 1;
 	if (after(end_seq, s_win) && before(seq, e_win))
 		return 1;
-	return (seq == e_win && seq == end_seq);
+	return seq == e_win && seq == end_seq;
 }
 
 /*
@@ -262,6 +261,7 @@ kill:
 	inet_twsk_put(tw);
 	return TCP_TW_SUCCESS;
 }
+EXPORT_SYMBOL(tcp_timewait_state_process);
 
 /*
  * Move a socket to time-wait or dead fin-wait-2 state.
@@ -362,7 +362,6 @@ void tcp_twsk_destructor(struct sock *sk)
 		tcp_free_md5sig_pool();
 #endif
 }
-
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 
 static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
@@ -510,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 	}
 	return newsk;
 }
+EXPORT_SYMBOL(tcp_create_openreq_child);
 
 /*
  *	Process an incoming packet for SYN_RECV sockets represented
@@ -706,6 +706,7 @@ embryonic_reset:
 	inet_csk_reqsk_queue_drop(sk, req, prev);
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_check_req);
 
 /*
  * Queue segment on the new socket if the new socket is active,
@@ -737,8 +738,4 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	sock_put(child);
 	return ret;
 }
-
-EXPORT_SYMBOL(tcp_check_req);
 EXPORT_SYMBOL(tcp_child_process);
-EXPORT_SYMBOL(tcp_create_openreq_child);
-EXPORT_SYMBOL(tcp_timewait_state_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7ed9dc1042d..05b1ecf3676 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		}
 	}
 
-	/* Set initial window to value enough for senders,
-	 * following RFC2414. Senders, not following this RFC,
-	 * will be satisfied with 2.
-	 */
+	/* Set initial window to value enough for senders, following RFC5681. */
 	if (mss > (1 << *rcv_wscale)) {
-		int init_cwnd = 4;
-		if (mss > 1460 * 3)
-			init_cwnd = 2;
-		else if (mss > 1460)
-			init_cwnd = 3;
+		int init_cwnd = rfc3390_bytes_to_packets(mss);
+
 		/* when initializing use the value from init_rcv_wnd
 		 * rather than the default from above
 		 */
@@ -247,6 +241,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
 }
+EXPORT_SYMBOL(tcp_select_initial_window);
 
 /* Chose a new window to advertise, update state in tcp_sock for the
  * socket, and return result with RFC1323 scaling applied.  The return
@@ -294,9 +289,9 @@ static u16 tcp_select_window(struct sock *sk)
 /* Packet ECN state for a SYN-ACK */
 static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+	TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
-		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+		TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
 }
 
 /* Packet ECN state for a SYN.  */
@@ -306,7 +301,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
 
 	tp->ecn_flags = 0;
 	if (sysctl_tcp_ecn == 1) {
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 	}
 }
@@ -361,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	skb_shinfo(skb)->gso_type = 0;
 
 	TCP_SKB_CB(skb)->seq = seq;
-	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
+	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
 		seq++;
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
@@ -820,7 +815,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
+	if (unlikely(tcb->flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -843,7 +838,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->flags);
 
-	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+	if (unlikely(tcb->flags & TCPHDR_SYN)) {
 		/* RFC1323: The window in SYN & SYN/ACK segments
 		 * is never scaled.
 		 */
@@ -866,7 +861,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	}
 
 	tcp_options_write((__be32 *)(th + 1), tp, &opts);
-	if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
+	if (likely((tcb->flags & TCPHDR_SYN) == 0))
 		TCP_ECN_send(sk, skb, tcp_header_size);
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -880,7 +875,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	icsk->icsk_af_ops->send_check(sk, skb);
 
-	if (likely(tcb->flags & TCPCB_FLAG_ACK))
+	if (likely(tcb->flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
 	if (skb->len != tcp_header_size)
@@ -1023,7 +1018,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
-	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
 
@@ -1189,6 +1184,7 @@ void tcp_mtup_init(struct sock *sk)
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
 }
+EXPORT_SYMBOL(tcp_mtup_init);
 
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
@@ -1232,6 +1228,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	return mss_now;
 }
+EXPORT_SYMBOL(tcp_sync_mss);
 
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
@@ -1328,8 +1325,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
 	u32 in_flight, cwnd;
 
 	/* Don't be strict about the congestion window for the final FIN.  */
-	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
+	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
 		return 1;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1374,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
 				  const struct sk_buff *skb,
 				  unsigned mss_now, int nonagle)
 {
-	return (skb->len < mss_now &&
+	return skb->len < mss_now &&
 		((nonagle & TCP_NAGLE_CORK) ||
-		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
 
 /* Return non-zero if the Nagle test allows this packet to be
@@ -1398,7 +1394,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 	 * Nagle can be ignored during F-RTO too (see RFC4138).
 	 */
 	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
 		return 1;
 
 	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1447,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb = tcp_send_head(sk);
 
-	return (skb &&
+	return skb &&
 		tcp_snd_test(sk, skb, tcp_current_mss(sk),
 			     (tcp_skb_is_last(sk, skb) ?
-			      tp->nonagle : TCP_NAGLE_PUSH)));
+			      tp->nonagle : TCP_NAGLE_PUSH));
 }
 
 /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1461,7 +1457,7 @@ int tcp_may_send_now(struct sock *sk)
  * packet has never been sent out before (and thus is not cloned).
  */
 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
-			unsigned int mss_now)
+			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -1471,7 +1467,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	if (skb->len != skb->data_len)
 		return tcp_fragment(sk, skb, len, mss_now);
 
-	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
 		return -ENOMEM;
 
@@ -1487,7 +1483,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
-	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
 
 	/* This packet was never sent out yet, so no SACK bits. */
@@ -1518,7 +1514,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
 
 	if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1644,7 +1640,7 @@ static int tcp_mtu_probe(struct sock *sk)
 
 	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
-	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
 	TCP_SKB_CB(nskb)->sacked = 0;
 	nskb->csum = 0;
 	nskb->ip_summed = skb->ip_summed;
@@ -1669,7 +1665,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			sk_wmem_free_skb(sk, skb);
 		} else {
 			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
-						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+						   ~(TCPHDR_FIN|TCPHDR_PSH);
 			if (!skb_shinfo(skb)->nr_frags) {
 				skb_pull(skb, copy);
 				if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1769,7 +1765,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    cwnd_quota);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2020,7 +2016,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 
 	if (!sysctl_tcp_retrans_collapse)
 		return;
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
 		return;
 
 	tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2112,7 +2108,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * since it is cheap to do so and saves bytes on the network.
 	 */
 	if (skb->len > 0 &&
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
 	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
 		if (!pskb_trim(skb, 0)) {
 			/* Reuse, even though it does some unnecessary work */
@@ -2304,7 +2300,7 @@ void tcp_send_fin(struct sock *sk)
 	mss_now = tcp_current_mss(sk);
 
 	if (tcp_send_head(sk) != NULL) {
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
 		TCP_SKB_CB(skb)->end_seq++;
 		tp->write_seq++;
 	} else {
@@ -2321,7 +2317,7 @@ void tcp_send_fin(struct sock *sk)
 		skb_reserve(skb, MAX_TCP_HEADER);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		tcp_init_nondata_skb(skb, tp->write_seq,
-				     TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+				     TCPHDR_ACK | TCPHDR_FIN);
 		tcp_queue_skb(sk, skb);
 	}
 	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2346,7 +2342,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	/* Reserve space for headers and prepare control bits. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
-			     TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+			     TCPHDR_ACK | TCPHDR_RST);
 	/* Send it off. */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2366,11 +2362,11 @@ int tcp_send_synack(struct sock *sk)
 	struct sk_buff *skb;
 
 	skb = tcp_write_queue_head(sk);
-	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
 		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
 		return -EFAULT;
 	}
-	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
+	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 			if (nskb == NULL)
@@ -2384,7 +2380,7 @@ int tcp_send_synack(struct sock *sk)
 			skb = nskb;
 		}
 
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
 		TCP_ECN_send_synack(tcp_sk(sk), skb);
 	}
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2427,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 		__u8 rcv_wscale;
 		/* Set this up on the first call only */
 		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+		/* limit the window selection if the user enforce a smaller rx buffer */
+		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+			req->window_clamp = tcp_full_space(sk);
+
 		/* tcp_full_space because it is guaranteed to be the first packet */
 		tcp_select_initial_window(tcp_full_space(sk),
 			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2463,7 +2465,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	 * not even correctly set)
 	 */
 	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
-			     TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+			     TCPHDR_SYN | TCPHDR_ACK);
 
 	if (OPTION_COOKIE_EXTENSION & opts.options) {
 		if (s_data_desired) {
@@ -2518,6 +2520,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	return skb;
 }
+EXPORT_SYMBOL(tcp_make_synack);
 
 /* Do all connect socket setups that can be done AF independent. */
 static void tcp_connect_init(struct sock *sk)
@@ -2552,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk)
 
 	tcp_initialize_rcv_mss(sk);
 
+	/* limit the window selection if the user enforce a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+		tp->window_clamp = tcp_full_space(sk);
+
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
@@ -2595,7 +2603,7 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tp->snd_nxt = tp->write_seq;
-	tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
+	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	TCP_ECN_send_syn(sk, buff);
 
 	/* Send it off. */
@@ -2620,6 +2628,7 @@ int tcp_connect(struct sock *sk)
 				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
+EXPORT_SYMBOL(tcp_connect);
 
 /* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
@@ -2701,7 +2710,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Reserve space for headers and prepare control bits. */
 	skb_reserve(buff, MAX_TCP_HEADER);
-	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
+	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2735,7 +2744,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	 * end to send an ack.  Don't queue or clone SKB, just
 	 * send it.
 	 */
-	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
+	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
@@ -2765,13 +2774,13 @@ int tcp_write_wakeup(struct sock *sk)
 		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
-			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+			TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
 			if (tcp_fragment(sk, skb, seg_size, mss))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(sk, skb, mss);
 
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 		if (!err)
@@ -2824,10 +2833,3 @@ void tcp_send_probe0(struct sock *sk)
 					  TCP_RTO_MAX);
 	}
 }
-
-EXPORT_SYMBOL(tcp_select_initial_window);
-EXPORT_SYMBOL(tcp_connect);
-EXPORT_SYMBOL(tcp_make_synack);
-EXPORT_SYMBOL(tcp_simple_retransmit);
-EXPORT_SYMBOL(tcp_sync_mss);
-EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index f8efada580e..6211e211417 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -214,6 +214,7 @@ static const struct file_operations tcpprobe_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = tcpprobe_open,
 	.read    = tcpprobe_read,
+	.llseek  = noop_llseek,
 };
 
 static __init int tcpprobe_init(void)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 440a5c6004f..74a6aa00365 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -41,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk)
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
 }
-
 EXPORT_SYMBOL(tcp_init_xmit_timers);
 
 static void tcp_write_err(struct sock *sk)
@@ -67,18 +66,18 @@ static void tcp_write_err(struct sock *sk)
 static int tcp_out_of_resources(struct sock *sk, int do_reset)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int orphans = percpu_counter_read_positive(&tcp_orphan_count);
+	int shift = 0;
 
 	/* If peer does not open window for long time, or did not transmit
 	 * anything for long time, penalize it. */
 	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
-		orphans <<= 1;
+		shift++;
 
 	/* If some dubious ICMP arrived, penalize even more. */
 	if (sk->sk_err_soft)
-		orphans <<= 1;
+		shift++;
 
-	if (tcp_too_many_orphans(sk, orphans)) {
+	if (tcp_too_many_orphans(sk, shift)) {
 		if (net_ratelimit())
 			printk(KERN_INFO "Out of socket memory\n");
 
@@ -136,13 +135,16 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 
 /* This function calculates a "timeout" which is equivalent to the timeout of a
  * TCP connection after "boundary" unsuccessful, exponentially backed-off
- * retransmissions with an initial RTO of TCP_RTO_MIN.
+ * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
+ * syn_set flag is set.
  */
 static bool retransmits_timed_out(struct sock *sk,
-				  unsigned int boundary)
+				  unsigned int boundary,
+				  unsigned int timeout,
+				  bool syn_set)
 {
-	unsigned int timeout, linear_backoff_thresh;
-	unsigned int start_ts;
+	unsigned int linear_backoff_thresh, start_ts;
+	unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
 
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
@@ -152,14 +154,15 @@ static bool retransmits_timed_out(struct sock *sk,
 	else
 		start_ts = tcp_sk(sk)->retrans_stamp;
 
-	linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
-
-	if (boundary <= linear_backoff_thresh)
-		timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
-	else
-		timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
-			  (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+	if (likely(timeout == 0)) {
+		linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
 
+		if (boundary <= linear_backoff_thresh)
+			timeout = ((2 << boundary) - 1) * rto_base;
+		else
+			timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+				(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+	}
 	return (tcp_time_stamp - start_ts) >= timeout;
 }
 
@@ -168,14 +171,15 @@ static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
-	bool do_reset;
+	bool do_reset, syn_set = 0;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
 			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		syn_set = 1;
 	} else {
-		if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
+		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -188,14 +192,15 @@ static int tcp_write_timeout(struct sock *sk)
 
 			retry_until = tcp_orphan_retries(sk, alive);
 			do_reset = alive ||
-				   !retransmits_timed_out(sk, retry_until);
+				!retransmits_timed_out(sk, retry_until, 0, 0);
 
 			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
 	}
 
-	if (retransmits_timed_out(sk, retry_until)) {
+	if (retransmits_timed_out(sk, retry_until,
+				  syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -362,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
 	if (icsk->icsk_retransmits == 0) {
 		int mib_idx;
 
-		if (icsk->icsk_ca_state == TCP_CA_Disorder) {
-			if (tcp_is_sack(tp))
-				mib_idx = LINUX_MIB_TCPSACKFAILURES;
-			else
-				mib_idx = LINUX_MIB_TCPRENOFAILURES;
-		} else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+		if (icsk->icsk_ca_state == TCP_CA_Recovery) {
 			if (tcp_is_sack(tp))
 				mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
 			else
 				mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
 		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 			mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+		} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+			   tp->sacked_out) {
+			if (tcp_is_sack(tp))
+				mib_idx = LINUX_MIB_TCPSACKFAILURES;
+			else
+				mib_idx = LINUX_MIB_TCPRENOFAILURES;
 		} else {
 			mib_idx = LINUX_MIB_TCPTIMEOUTS;
 		}
@@ -437,7 +443,7 @@ out_reset_timer:
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
+	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
 		__sk_dst_reset(sk);
 
 out:;
@@ -557,7 +563,14 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = keepalive_time_elapsed(tp);
 
 	if (elapsed >= keepalive_time_when(tp)) {
-		if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
+		/* If the TCP_USER_TIMEOUT option is enabled, use that
+		 * to determine when to timeout instead.
+		 */
+		if ((icsk->icsk_user_timeout != 0 &&
+		    elapsed >= icsk->icsk_user_timeout &&
+		    icsk->icsk_probes_out > 0) ||
+		    (icsk->icsk_user_timeout == 0 &&
+		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
 			goto out;
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index b612acf7618..38bc0b52d74 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -6,7 +6,7 @@
  *    "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
  *    IEEE Journal on Selected Areas in Communication,
  *    Feb. 2003.
- * 	See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf
+ * 	See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
  */
 
 #include <linux/mm.h>
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a624..a534dda5456 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
  */
 static inline u32 westwood_do_filter(u32 a, u32 b)
 {
-	return (((7 * a) + b) >> 3);
+	return ((7 * a) + b) >> 3;
 }
 
 static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3b3813cc80b..ac3b3ee4b07 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,32 +14,37 @@
 #include <net/protocol.h>
 #include <net/xfrm.h>
 
-static struct xfrm_tunnel *tunnel4_handlers;
-static struct xfrm_tunnel *tunnel64_handlers;
+static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly;
 static DEFINE_MUTEX(tunnel4_mutex);
 
-static inline struct xfrm_tunnel **fam_handlers(unsigned short family)
+static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family)
 {
 	return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers;
 }
 
 int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
 {
-	struct xfrm_tunnel **pprev;
+	struct xfrm_tunnel __rcu **pprev;
+	struct xfrm_tunnel *t;
+
 	int ret = -EEXIST;
 	int priority = handler->priority;
 
 	mutex_lock(&tunnel4_mutex);
 
-	for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) {
-		if ((*pprev)->priority > priority)
+	for (pprev = fam_handlers(family);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&tunnel4_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority > priority)
 			break;
-		if ((*pprev)->priority == priority)
+		if (t->priority == priority)
 			goto err;
 	}
 
 	handler->next = *pprev;
-	*pprev = handler;
+	rcu_assign_pointer(*pprev, handler);
 
 	ret = 0;
 
@@ -48,18 +53,21 @@ err:
 
 	return ret;
 }
-
 EXPORT_SYMBOL(xfrm4_tunnel_register);
 
 int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
 {
-	struct xfrm_tunnel **pprev;
+	struct xfrm_tunnel __rcu **pprev;
+	struct xfrm_tunnel *t;
 	int ret = -ENOENT;
 
 	mutex_lock(&tunnel4_mutex);
 
-	for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) {
-		if (*pprev == handler) {
+	for (pprev = fam_handlers(family);
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&tunnel4_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
 			*pprev = handler->next;
 			ret = 0;
 			break;
@@ -72,9 +80,13 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
 
 	return ret;
 }
-
 EXPORT_SYMBOL(xfrm4_tunnel_deregister);
 
+#define for_each_tunnel_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))	\
+	
 static int tunnel4_rcv(struct sk_buff *skb)
 {
 	struct xfrm_tunnel *handler;
@@ -82,7 +94,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto drop;
 
-	for (handler = tunnel4_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel4_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -101,7 +113,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	for (handler = tunnel64_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel64_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -117,7 +129,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
-	for (handler = tunnel4_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel4_handlers, handler)
 		if (!handler->err_handler(skb, info))
 			break;
 }
@@ -127,7 +139,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
-	for (handler = tunnel64_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel64_handlers, handler)
 		if (!handler->err_handler(skb, info))
 			break;
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eec4ff456e3..28cb2d733a3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		return -EOPNOTSUPP;
 
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 
 	if (up->pending) {
 		/*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	ipc.addr = inet->inet_saddr;
 
 	ipc.oif = sk->sk_bound_dev_if;
-	err = sock_tx_timestamp(msg, sk, &ipc.shtx);
+	err = sock_tx_timestamp(sk, &ipc.tx_flags);
 	if (err)
 		return err;
 	if (msg->msg_controllen) {
@@ -914,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		    !sock_flag(sk, SOCK_BROADCAST))
 			goto out;
 		if (connected)
-			sk_dst_set(sk, dst_clone(&rt->u.dst));
+			sk_dst_set(sk, dst_clone(&rt->dst));
 	}
 
 	if (msg->msg_flags&MSG_CONFIRM)
@@ -978,7 +978,7 @@ out:
 	return err;
 
 do_confirm:
-	dst_confirm(&rt->u.dst);
+	dst_confirm(&rt->dst);
 	if (!(msg->msg_flags&MSG_PROBE) || len)
 		goto back_from_confirm;
 	err = 0;
@@ -1260,6 +1260,49 @@ void udp_lib_unhash(struct sock *sk)
 }
 EXPORT_SYMBOL(udp_lib_unhash);
 
+/*
+ * inet_rcv_saddr was changed, we must rehash secondary hash
+ */
+void udp_lib_rehash(struct sock *sk, u16 newhash)
+{
+	if (sk_hashed(sk)) {
+		struct udp_table *udptable = sk->sk_prot->h.udp_table;
+		struct udp_hslot *hslot, *hslot2, *nhslot2;
+
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		nhslot2 = udp_hashslot2(udptable, newhash);
+		udp_sk(sk)->udp_portaddr_hash = newhash;
+		if (hslot2 != nhslot2) {
+			hslot = udp_hashslot(udptable, sock_net(sk),
+					     udp_sk(sk)->udp_port_hash);
+			/* we must lock primary chain too */
+			spin_lock_bh(&hslot->lock);
+
+			spin_lock(&hslot2->lock);
+			hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+			hslot2->count--;
+			spin_unlock(&hslot2->lock);
+
+			spin_lock(&nhslot2->lock);
+			hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+						 &nhslot2->head);
+			nhslot2->count++;
+			spin_unlock(&nhslot2->lock);
+
+			spin_unlock_bh(&hslot->lock);
+		}
+	}
+}
+EXPORT_SYMBOL(udp_lib_rehash);
+
+static void udp_v4_rehash(struct sock *sk)
+{
+	u16 new_hash = udp4_portaddr_hash(sock_net(sk),
+					  inet_sk(sk)->inet_rcv_saddr,
+					  inet_sk(sk)->inet_num);
+	udp_lib_rehash(sk, new_hash);
+}
+
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
@@ -1370,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	if (sk->sk_filter) {
+	if (rcu_dereference_raw(sk->sk_filter)) {
 		if (udp_lib_checksum_complete(skb))
 			goto drop;
 	}
@@ -1843,6 +1886,7 @@ struct proto udp_prot = {
 	.backlog_rcv	   = __udp_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
 	.memory_allocated  = &udp_memory_allocated,
 	.sysctl_mem	   = sysctl_udp_mem,
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 6610bf76369..ab76aa928fa 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -58,6 +58,7 @@ struct proto 	udplite_prot = {
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
 };
+EXPORT_SYMBOL(udplite_prot);
 
 static struct inet_protosw udplite4_protosw = {
 	.type		=  SOCK_DGRAM,
@@ -127,5 +128,3 @@ out_unregister_proto:
 out_register_err:
 	printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
 }
-
-EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index ad8fbb871aa..06814b6216d 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -163,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb)
 {
 	return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
 }
-
 EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 23883a48ebf..4464f3bff6a 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
 		fl.fl4_src = saddr->a4;
 
 	err = __ip_route_output_key(net, &rt, &fl);
-	dst = &rt->u.dst;
+	dst = &rt->dst;
 	if (err)
 		dst = ERR_PTR(err);
 	return dst;
@@ -61,7 +61,7 @@ static int xfrm4_get_saddr(struct net *net,
 
 static int xfrm4_get_tos(struct flowi *fl)
 {
-	return fl->fl4_tos;
+	return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */
 }
 
 static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
 
 	xfrm4_policy_afinfo.garbage_collect(net);
-	return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
 static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
 	.ifdown =		xfrm4_dst_ifdown,
 	.local_out =		__ip_local_out,
 	.gc_thresh =		1024,
-	.entries =		ATOMIC_INIT(0),
 };
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
 	 * and start cleaning when were 1/2 full
 	 */
 	xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+	dst_entries_init(&xfrm4_dst_ops);
 
 	xfrm4_state_init();
 	xfrm4_policy_init();
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 1ef1366a0a0..47947624ecc 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -21,21 +21,25 @@ static int xfrm4_init_flags(struct xfrm_state *x)
 }
 
 static void
-__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		     struct xfrm_tmpl *tmpl,
-		     xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
+{
+	sel->daddr.a4 = fl->fl4_dst;
+	sel->saddr.a4 = fl->fl4_src;
+	sel->dport = xfrm_flowi_dport(fl);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET;
+	sel->prefixlen_d = 32;
+	sel->prefixlen_s = 32;
+	sel->proto = fl->proto;
+	sel->ifindex = fl->oif;
+}
+
+static void
+xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+		   xfrm_address_t *daddr, xfrm_address_t *saddr)
 {
-	x->sel.daddr.a4 = fl->fl4_dst;
-	x->sel.saddr.a4 = fl->fl4_src;
-	x->sel.dport = xfrm_flowi_dport(fl);
-	x->sel.dport_mask = htons(0xffff);
-	x->sel.sport = xfrm_flowi_sport(fl);
-	x->sel.sport_mask = htons(0xffff);
-	x->sel.family = AF_INET;
-	x->sel.prefixlen_d = 32;
-	x->sel.prefixlen_s = 32;
-	x->sel.proto = fl->proto;
-	x->sel.ifindex = fl->oif;
 	x->id = tmpl->id;
 	if (x->id.daddr.a4 == 0)
 		x->id.daddr.a4 = daddr->a4;
@@ -70,6 +74,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.owner			= THIS_MODULE,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
+	.init_temprop		= xfrm4_init_temprop,
 	.output			= xfrm4_output,
 	.extract_input		= xfrm4_extract_input,
 	.extract_output		= xfrm4_extract_output,
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d208..82806455e85 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
 	return -ENOENT;
 }
 
-static struct xfrm_tunnel xfrm_tunnel_handler = {
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
 	.handler	=	xfrm_tunnel_rcv,
 	.err_handler	=	xfrm_tunnel_err,
 	.priority	=	2,
 };
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static struct xfrm_tunnel xfrm64_tunnel_handler = {
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
 	.handler	=	xfrm_tunnel_rcv,
 	.err_handler	=	xfrm_tunnel_err,
 	.priority	=	2,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 784f34d11fd..e048ec62d10 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -121,8 +121,6 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
 static int __ipv6_regen_rndid(struct inet6_dev *idev);
 static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
 static void ipv6_regen_rndid(unsigned long data);
-
-static int desync_factor = MAX_DESYNC_FACTOR * HZ;
 #endif
 
 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
@@ -245,7 +243,7 @@ static inline bool addrconf_qdisc_ok(const struct net_device *dev)
 /* Check if a route is valid prefix route */
 static inline int addrconf_is_prefix_route(const struct rt6_info *rt)
 {
-	return ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0);
+	return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0;
 }
 
 static void addrconf_del_timer(struct inet6_ifaddr *ifp)
@@ -284,13 +282,16 @@ static void addrconf_mod_timer(struct inet6_ifaddr *ifp,
 static int snmp6_alloc_dev(struct inet6_dev *idev)
 {
 	if (snmp_mib_init((void __percpu **)idev->stats.ipv6,
-			  sizeof(struct ipstats_mib)) < 0)
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip;
 	if (snmp_mib_init((void __percpu **)idev->stats.icmpv6,
-			  sizeof(struct icmpv6_mib)) < 0)
+			  sizeof(struct icmpv6_mib),
+			  __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp;
 	if (snmp_mib_init((void __percpu **)idev->stats.icmpv6msg,
-			  sizeof(struct icmpv6msg_mib)) < 0)
+			  sizeof(struct icmpv6msg_mib),
+			  __alignof__(struct icmpv6msg_mib)) < 0)
 		goto err_icmpmsg;
 
 	return 0;
@@ -557,7 +558,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
 		pr_warning("Freeing alive inet6 address %p\n", ifp);
 		return;
 	}
-	dst_release(&ifp->rt->u.dst);
+	dst_release(&ifp->rt->dst);
 
 	call_rcu(&ifp->rcu, inet6_ifa_finish_destroy_rcu);
 }
@@ -823,7 +824,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 				rt->rt6i_flags |= RTF_EXPIRES;
 			}
 		}
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 	}
 
 out:
@@ -835,7 +836,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
 {
 	struct inet6_dev *idev = ifp->idev;
 	struct in6_addr addr, *tmpaddr;
-	unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp;
+	unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp, age;
 	unsigned long regen_advance;
 	int tmp_plen;
 	int ret = 0;
@@ -885,12 +886,14 @@ retry:
 		goto out;
 	}
 	memcpy(&addr.s6_addr[8], idev->rndid, 8);
+	age = (jiffies - ifp->tstamp) / HZ;
 	tmp_valid_lft = min_t(__u32,
 			      ifp->valid_lft,
-			      idev->cnf.temp_valid_lft);
+			      idev->cnf.temp_valid_lft + age);
 	tmp_prefered_lft = min_t(__u32,
 				 ifp->prefered_lft,
-				 idev->cnf.temp_prefered_lft - desync_factor / HZ);
+				 idev->cnf.temp_prefered_lft + age -
+				 idev->cnf.max_desync_factor);
 	tmp_plen = ifp->prefix_len;
 	max_addresses = idev->cnf.max_addresses;
 	tmp_cstamp = ifp->cstamp;
@@ -1424,8 +1427,10 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
 {
 	struct inet6_dev *idev = ifp->idev;
 
-	if (addrconf_dad_end(ifp))
+	if (addrconf_dad_end(ifp)) {
+		in6_ifa_put(ifp);
 		return;
+	}
 
 	if (net_ratelimit())
 		printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n",
@@ -1542,7 +1547,7 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
 	return 0;
 }
 
-int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
+static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
 {
 	if (addr == 0)
 		return -1;
@@ -1558,7 +1563,6 @@ int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
 	memcpy(eui + 4, &addr, 4);
 	return 0;
 }
-EXPORT_SYMBOL(__ipv6_isatap_ifid);
 
 static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
 {
@@ -1650,7 +1654,8 @@ static void ipv6_regen_rndid(unsigned long data)
 
 	expires = jiffies +
 		idev->cnf.temp_prefered_lft * HZ -
-		idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor;
+		idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time -
+		idev->cnf.max_desync_factor * HZ;
 	if (time_before(expires, jiffies)) {
 		printk(KERN_WARNING
 			"ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n",
@@ -1866,7 +1871,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 					      dev, expires, flags);
 		}
 		if (rt)
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 	}
 
 	/* Try to figure out our local address for this prefix */
@@ -2019,10 +2024,11 @@ ok:
 					ipv6_ifa_notify(0, ift);
 			}
 
-			if (create && in6_dev->cnf.use_tempaddr > 0) {
+			if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) {
 				/*
 				 * When a new public address is created as described in [ADDRCONF],
-				 * also create a new temporary address.
+				 * also create a new temporary address. Also create a temporary
+				 * address if it's enabled but no temporary address currently exists.
 				 */
 				read_unlock_bh(&in6_dev->lock);
 				ipv6_create_tempaddr(ifp, NULL);
@@ -2961,7 +2967,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
 	   start sending router solicitations.
 	 */
 
-	if (ifp->idev->cnf.forwarding == 0 &&
+	if ((ifp->idev->cnf.forwarding == 0 ||
+	     ifp->idev->cnf.forwarding == 2) &&
 	    ifp->idev->cnf.rtr_solicits > 0 &&
 	    (dev->flags&IFF_LOOPBACK) == 0 &&
 	    (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
@@ -3496,8 +3503,12 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
 				preferred -= tval;
 			else
 				preferred = 0;
-			if (valid != INFINITY_LIFE_TIME)
-				valid -= tval;
+			if (valid != INFINITY_LIFE_TIME) {
+				if (valid > tval)
+					valid -= tval;
+				else
+					valid = 0;
+			}
 		}
 	} else {
 		preferred = INFINITY_LIFE_TIME;
@@ -3859,12 +3870,28 @@ static inline void __snmp6_fill_stats(u64 *stats, void __percpu **mib,
 	memset(&stats[items], 0, pad);
 }
 
+static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib,
+				      int items, int bytes, size_t syncpoff)
+{
+	int i;
+	int pad = bytes - sizeof(u64) * items;
+	BUG_ON(pad < 0);
+
+	/* Use put_unaligned() because stats may not be aligned for u64. */
+	put_unaligned(items, &stats[0]);
+	for (i = 1; i < items; i++)
+		put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+
+	memset(&stats[items], 0, pad);
+}
+
 static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
 			     int bytes)
 {
 	switch (attrtype) {
 	case IFLA_INET6_STATS:
-		__snmp6_fill_stats(stats, (void __percpu **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes);
+		__snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6,
+				     IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
 		break;
 	case IFLA_INET6_ICMP6STATS:
 		__snmp6_fill_stats(stats, (void __percpu **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes);
@@ -4097,11 +4124,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		if (ifp->idev->cnf.forwarding)
 			addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
-		dst_hold(&ifp->rt->u.dst);
+		dst_hold(&ifp->rt->dst);
 
 		if (ifp->state == INET6_IFADDR_STATE_DEAD &&
 		    ip6_del_rt(ifp->rt))
-			dst_free(&ifp->rt->u.dst);
+			dst_free(&ifp->rt->dst);
 		break;
 	}
 }
@@ -4614,10 +4641,12 @@ int __init addrconf_init(void)
 	if (err < 0) {
 		printk(KERN_CRIT "IPv6 Addrconf:"
 		       " cannot initialize default policy table: %d.\n", err);
-		return err;
+		goto out;
 	}
 
-	register_pernet_subsys(&addrconf_ops);
+	err = register_pernet_subsys(&addrconf_ops);
+	if (err < 0)
+		goto out_addrlabel;
 
 	/* The addrconf netdev notifier requires that loopback_dev
 	 * has it's ipv6 private information allocated and setup
@@ -4669,7 +4698,9 @@ errout:
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 errlo:
 	unregister_pernet_subsys(&addrconf_ops);
-
+out_addrlabel:
+	ipv6_addr_label_cleanup();
+out:
 	return err;
 }
 
@@ -4680,6 +4711,7 @@ void addrconf_cleanup(void)
 
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 	unregister_pernet_subsys(&addrconf_ops);
+	ipv6_addr_label_cleanup();
 
 	rtnl_lock();
 
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 8c4348cb195..c8993e5a337 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -53,11 +53,7 @@ static struct ip6addrlbl_table
 static inline
 struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
 {
-#ifdef CONFIG_NET_NS
-	return lbl->lbl_net;
-#else
-	return &init_net;
-#endif
+	return read_pnet(&lbl->lbl_net);
 }
 
 /*
@@ -397,6 +393,11 @@ int __init ipv6_addr_label_init(void)
 	return register_pernet_subsys(&ipv6_addr_label_ops);
 }
 
+void ipv6_addr_label_cleanup(void)
+{
+	unregister_pernet_subsys(&ipv6_addr_label_ops);
+}
+
 static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
 	[IFAL_ADDRESS]		= { .len = sizeof(struct in6_addr), },
 	[IFAL_LABEL]		= { .len = sizeof(u32), },
@@ -517,10 +518,9 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 static inline int ip6addrlbl_msgsize(void)
 {
-	return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
+	return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
 		+ nla_total_size(16)	/* IFAL_ADDRESS */
-		+ nla_total_size(4)	/* IFAL_LABEL */
-	);
+		+ nla_total_size(4);	/* IFAL_LABEL */
 }
 
 static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e733942dafe..54e8e42f7a8 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -343,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			 */
 			v4addr = LOOPBACK4_IPV6;
 			if (!(addr_type & IPV6_ADDR_MULTICAST))	{
-				if (!ipv6_chk_addr(net, &addr->sin6_addr,
+				if (!inet->transparent &&
+				    !ipv6_chk_addr(net, &addr->sin6_addr,
 						   dev, 0)) {
 					err = -EADDRNOTAVAIL;
 					goto out_unlock;
@@ -467,7 +468,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 	if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
 		sin->sin6_scope_id = sk->sk_bound_dev_if;
 	*uaddr_len = sizeof(*sin);
-	return(0);
+	return 0;
 }
 
 EXPORT_SYMBOL(inet6_getname);
@@ -488,7 +489,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCADDRT:
 	case SIOCDELRT:
 
-		return(ipv6_route_ioctl(net, cmd, (void __user *)arg));
+		return ipv6_route_ioctl(net, cmd, (void __user *)arg);
 
 	case SIOCSIFADDR:
 		return addrconf_add_ifaddr(net, (void __user *) arg);
@@ -502,7 +503,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		return sk->sk_prot->ioctl(sk, cmd, arg);
 	}
 	/*NOTREACHED*/
-	return(0);
+	return 0;
 }
 
 EXPORT_SYMBOL(inet6_ioctl);
@@ -522,10 +523,10 @@ const struct proto_ops inet6_stream_ops = {
 	.shutdown	   = inet_shutdown,		/* ok		*/
 	.setsockopt	   = sock_common_setsockopt,	/* ok		*/
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
-	.sendmsg	   = tcp_sendmsg,		/* ok		*/
-	.recvmsg	   = sock_common_recvmsg,	/* ok		*/
+	.sendmsg	   = inet_sendmsg,		/* ok		*/
+	.recvmsg	   = inet_recvmsg,		/* ok		*/
 	.mmap		   = sock_no_mmap,
-	.sendpage	   = tcp_sendpage,
+	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
@@ -549,7 +550,7 @@ const struct proto_ops inet6_dgram_ops = {
 	.setsockopt	   = sock_common_setsockopt,	/* ok		*/
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
-	.recvmsg	   = sock_common_recvmsg,	/* ok		*/
+	.recvmsg	   = inet_recvmsg,		/* ok		*/
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = sock_no_sendpage,
 #ifdef CONFIG_COMPAT
@@ -651,7 +652,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 
 	if (dst == NULL) {
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 
 		memset(&fl, 0, sizeof(fl));
@@ -665,12 +666,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 		fl.fl_ip_sport = inet->inet_sport;
 		security_sk_classify_flow(sk, &fl);
 
-		if (np->opt && np->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, np->opt, &final);
 
 		err = ip6_dst_lookup(sk, &dst, &fl);
 		if (err) {
@@ -976,19 +972,24 @@ static void ipv6_packet_cleanup(void)
 static int __net_init ipv6_init_mibs(struct net *net)
 {
 	if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6,
-			  sizeof (struct udp_mib)) < 0)
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		return -ENOMEM;
 	if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6,
-			  sizeof (struct udp_mib)) < 0)
+			  sizeof(struct udp_mib),
+			  __alignof__(struct udp_mib)) < 0)
 		goto err_udplite_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics,
-			  sizeof(struct ipstats_mib)) < 0)
+			  sizeof(struct ipstats_mib),
+			  __alignof__(struct ipstats_mib)) < 0)
 		goto err_ip_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics,
-			  sizeof(struct icmpv6_mib)) < 0)
+			  sizeof(struct icmpv6_mib),
+			  __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp_mib;
 	if (snmp_mib_init((void __percpu **)net->mib.icmpv6msg_statistics,
-			  sizeof(struct icmpv6msg_mib)) < 0)
+			  sizeof(struct icmpv6msg_mib),
+			  __alignof__(struct icmpv6msg_mib)) < 0)
 		goto err_icmpmsg_mib;
 	return 0;
 
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index b5b07054508..0e5e943446f 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -77,41 +77,40 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 	pac->acl_next = NULL;
 	ipv6_addr_copy(&pac->acl_addr, addr);
 
+	rcu_read_lock();
 	if (ifindex == 0) {
 		struct rt6_info *rt;
 
 		rt = rt6_lookup(net, addr, NULL, 0, 0);
 		if (rt) {
 			dev = rt->rt6i_dev;
-			dev_hold(dev);
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 		} else if (ishost) {
 			err = -EADDRNOTAVAIL;
-			goto out_free_pac;
+			goto error;
 		} else {
 			/* router, no matching interface: just pick one */
-
-			dev = dev_get_by_flags(net, IFF_UP, IFF_UP|IFF_LOOPBACK);
+			dev = dev_get_by_flags_rcu(net, IFF_UP,
+						   IFF_UP | IFF_LOOPBACK);
 		}
 	} else
-		dev = dev_get_by_index(net, ifindex);
+		dev = dev_get_by_index_rcu(net, ifindex);
 
 	if (dev == NULL) {
 		err = -ENODEV;
-		goto out_free_pac;
+		goto error;
 	}
 
-	idev = in6_dev_get(dev);
+	idev = __in6_dev_get(dev);
 	if (!idev) {
 		if (ifindex)
 			err = -ENODEV;
 		else
 			err = -EADDRNOTAVAIL;
-		goto out_dev_put;
+		goto error;
 	}
 	/* reset ishost, now that we have a specific device */
 	ishost = !idev->cnf.forwarding;
-	in6_dev_put(idev);
 
 	pac->acl_ifindex = dev->ifindex;
 
@@ -124,26 +123,22 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 		if (ishost)
 			err = -EADDRNOTAVAIL;
 		if (err)
-			goto out_dev_put;
+			goto error;
 	}
 
 	err = ipv6_dev_ac_inc(dev, addr);
-	if (err)
-		goto out_dev_put;
-
-	write_lock_bh(&ipv6_sk_ac_lock);
-	pac->acl_next = np->ipv6_ac_list;
-	np->ipv6_ac_list = pac;
-	write_unlock_bh(&ipv6_sk_ac_lock);
-
-	dev_put(dev);
-
-	return 0;
+	if (!err) {
+		write_lock_bh(&ipv6_sk_ac_lock);
+		pac->acl_next = np->ipv6_ac_list;
+		np->ipv6_ac_list = pac;
+		write_unlock_bh(&ipv6_sk_ac_lock);
+		pac = NULL;
+	}
 
-out_dev_put:
-	dev_put(dev);
-out_free_pac:
-	sock_kfree_s(sk, pac, sizeof(*pac));
+error:
+	rcu_read_unlock();
+	if (pac)
+		sock_kfree_s(sk, pac, sizeof(*pac));
 	return err;
 }
 
@@ -176,11 +171,12 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
 
 	write_unlock_bh(&ipv6_sk_ac_lock);
 
-	dev = dev_get_by_index(net, pac->acl_ifindex);
-	if (dev) {
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
+	if (dev)
 		ipv6_dev_ac_dec(dev, &pac->acl_addr);
-		dev_put(dev);
-	}
+	rcu_read_unlock();
+
 	sock_kfree_s(sk, pac, sizeof(*pac));
 	return 0;
 }
@@ -199,13 +195,12 @@ void ipv6_sock_ac_close(struct sock *sk)
 	write_unlock_bh(&ipv6_sk_ac_lock);
 
 	prev_index = 0;
+	rcu_read_lock();
 	while (pac) {
 		struct ipv6_ac_socklist *next = pac->acl_next;
 
 		if (pac->acl_ifindex != prev_index) {
-			if (dev)
-				dev_put(dev);
-			dev = dev_get_by_index(net, pac->acl_ifindex);
+			dev = dev_get_by_index_rcu(net, pac->acl_ifindex);
 			prev_index = pac->acl_ifindex;
 		}
 		if (dev)
@@ -213,8 +208,7 @@ void ipv6_sock_ac_close(struct sock *sk)
 		sock_kfree_s(sk, pac, sizeof(*pac));
 		pac = next;
 	}
-	if (dev)
-		dev_put(dev);
+	rcu_read_unlock();
 }
 
 #if 0
@@ -250,7 +244,7 @@ static void aca_put(struct ifacaddr6 *ac)
 {
 	if (atomic_dec_and_test(&ac->aca_refcnt)) {
 		in6_dev_put(ac->aca_idev);
-		dst_release(&ac->aca_rt->u.dst);
+		dst_release(&ac->aca_rt->dst);
 		kfree(ac);
 	}
 }
@@ -356,40 +350,39 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
-	dst_hold(&aca->aca_rt->u.dst);
+	dst_hold(&aca->aca_rt->dst);
 	ip6_del_rt(aca->aca_rt);
 
 	aca_put(aca);
 	return 0;
 }
 
+/* called with rcu_read_lock() */
 static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr)
 {
-	int ret;
-	struct inet6_dev *idev = in6_dev_get(dev);
+	struct inet6_dev *idev = __in6_dev_get(dev);
+
 	if (idev == NULL)
 		return -ENODEV;
-	ret = __ipv6_dev_ac_dec(idev, addr);
-	in6_dev_put(idev);
-	return ret;
+	return __ipv6_dev_ac_dec(idev, addr);
 }
 
 /*
  *	check if the interface has this anycast address
+ *	called with rcu_read_lock()
  */
 static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr)
 {
 	struct inet6_dev *idev;
 	struct ifacaddr6 *aca;
 
-	idev = in6_dev_get(dev);
+	idev = __in6_dev_get(dev);
 	if (idev) {
 		read_lock_bh(&idev->lock);
 		for (aca = idev->ac_list; aca; aca = aca->aca_next)
 			if (ipv6_addr_equal(&aca->aca_addr, addr))
 				break;
 		read_unlock_bh(&idev->lock);
-		in6_dev_put(idev);
 		return aca != NULL;
 	}
 	return 0;
@@ -403,14 +396,15 @@ int ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
 {
 	int found = 0;
 
-	if (dev)
-		return ipv6_chk_acast_dev(dev, addr);
 	rcu_read_lock();
-	for_each_netdev_rcu(net, dev)
-		if (ipv6_chk_acast_dev(dev, addr)) {
-			found = 1;
-			break;
-		}
+	if (dev)
+		found = ipv6_chk_acast_dev(dev, addr);
+	else
+		for_each_netdev_rcu(net, dev)
+			if (ipv6_chk_acast_dev(dev, addr)) {
+				found = 1;
+				break;
+			}
 	rcu_read_unlock();
 	return found;
 }
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 712684687c9..320bdb877ee 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -38,10 +38,11 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock      	*inet = inet_sk(sk);
 	struct ipv6_pinfo      	*np = inet6_sk(sk);
-	struct in6_addr		*daddr, *final_p = NULL, final;
+	struct in6_addr		*daddr, *final_p, final;
 	struct dst_entry	*dst;
 	struct flowi		fl;
 	struct ip6_flowlabel	*flowlabel = NULL;
+	struct ipv6_txoptions   *opt;
 	int			addr_type;
 	int			err;
 
@@ -104,9 +105,12 @@ ipv4_connected:
 		if (ipv6_addr_any(&np->saddr))
 			ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
 
-		if (ipv6_addr_any(&np->rcv_saddr))
+		if (ipv6_addr_any(&np->rcv_saddr)) {
 			ipv6_addr_set_v4mapped(inet->inet_rcv_saddr,
 					       &np->rcv_saddr);
+			if (sk->sk_prot->rehash)
+				sk->sk_prot->rehash(sk);
+		}
 
 		goto out;
 	}
@@ -155,19 +159,8 @@ ipv4_connected:
 
 	security_sk_classify_flow(sk, &fl);
 
-	if (flowlabel) {
-		if (flowlabel->opt && flowlabel->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
-	} else if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	opt = flowlabel ? flowlabel->opt : np->opt;
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
@@ -191,6 +184,8 @@ ipv4_connected:
 	if (ipv6_addr_any(&np->rcv_saddr)) {
 		ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src);
 		inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
 	}
 
 	ip6_dst_store(sk, dst,
@@ -582,6 +577,25 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
 		u8 *ptr = nh + opt->dst1;
 		put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr);
 	}
+	if (np->rxopt.bits.rxorigdstaddr) {
+		struct sockaddr_in6 sin6;
+		u16 *ports = (u16 *) skb_transport_header(skb);
+
+		if (skb_transport_offset(skb) + 4 <= skb->len) {
+			/* All current transport protocols have the port numbers in the
+			 * first four bytes of the transport header and this function is
+			 * written with this assumption in mind.
+			 */
+
+			sin6.sin6_family = AF_INET6;
+			ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr);
+			sin6.sin6_port = ports[1];
+			sin6.sin6_flowinfo = 0;
+			sin6.sin6_scope_id = 0;
+
+			put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
+		}
+	}
 	return 0;
 }
 
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 8a659f92d17..262f105d23b 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -312,6 +312,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
   Routing header.
  ********************************/
 
+/* called with rcu_read_lock() */
 static int ipv6_rthdr_rcv(struct sk_buff *skb)
 {
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -324,12 +325,9 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 	struct net *net = dev_net(skb->dev);
 	int accept_source_route = net->ipv6.devconf_all->accept_source_route;
 
-	idev = in6_dev_get(skb->dev);
-	if (idev) {
-		if (accept_source_route > idev->cnf.accept_source_route)
-			accept_source_route = idev->cnf.accept_source_route;
-		in6_dev_put(idev);
-	}
+	idev = __in6_dev_get(skb->dev);
+	if (idev && accept_source_route > idev->cnf.accept_source_route)
+		accept_source_route = idev->cnf.accept_source_route;
 
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
 	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
@@ -874,3 +872,27 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 	return opt;
 }
 
+/**
+ * fl6_update_dst - update flowi destination address with info given
+ *                  by srcrt option, if any.
+ *
+ * @fl: flowi for which fl6_dst is to be updated
+ * @opt: struct ipv6_txoptions in which to look for srcrt opt
+ * @orig: copy of original fl6_dst address if modified
+ *
+ * Returns NULL if no txoptions or no srcrt, otherwise returns orig
+ * and initial value of fl->fl6_dst set in orig
+ */
+struct in6_addr *fl6_update_dst(struct flowi *fl,
+				const struct ipv6_txoptions *opt,
+				struct in6_addr *orig)
+{
+	if (!opt || !opt->srcrt)
+		return NULL;
+
+	ipv6_addr_copy(orig, &fl->fl6_dst);
+	ipv6_addr_copy(&fl->fl6_dst, ((struct rt0_hdr *)opt->srcrt)->addr);
+	return orig;
+}
+
+EXPORT_SYMBOL_GPL(fl6_update_dst);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index e1caa5d526c..14ed0a955b5 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -13,12 +13,12 @@ int ipv6_ext_hdr(u8 nexthdr)
 	/*
 	 * find out if nexthdr is an extension header or a protocol
 	 */
-	return ( (nexthdr == NEXTHDR_HOP)	||
+	return   (nexthdr == NEXTHDR_HOP)	||
 		 (nexthdr == NEXTHDR_ROUTING)	||
 		 (nexthdr == NEXTHDR_FRAGMENT)	||
 		 (nexthdr == NEXTHDR_AUTH)	||
 		 (nexthdr == NEXTHDR_NONE)	||
-		 (nexthdr == NEXTHDR_DEST) );
+		 (nexthdr == NEXTHDR_DEST);
 }
 
 /*
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 8e44f8f9c18..d829874d894 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -34,17 +34,16 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl,
 {
 	struct fib_lookup_arg arg = {
 		.lookup_ptr = lookup,
+		.flags = FIB_LOOKUP_NOREF,
 	};
 
 	fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg);
-	if (arg.rule)
-		fib_rule_put(arg.rule);
 
 	if (arg.result)
 		return arg.result;
 
-	dst_hold(&net->ipv6.ip6_null_entry->u.dst);
-	return &net->ipv6.ip6_null_entry->u.dst;
+	dst_hold(&net->ipv6.ip6_null_entry->dst);
+	return &net->ipv6.ip6_null_entry->dst;
 }
 
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
@@ -86,7 +85,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			struct in6_addr saddr;
 
 			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->u.dst)->dev,
+					       ip6_dst_idev(&rt->dst)->dev,
 					       &flp->fl6_dst,
 					       rt6_flags2srcprefs(flags),
 					       &saddr))
@@ -99,12 +98,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 		goto out;
 	}
 again:
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 	rt = NULL;
 	goto out;
 
 discard_pkt:
-	dst_hold(&rt->u.dst);
+	dst_hold(&rt->dst);
 out:
 	arg->result = rt;
 	return rt == NULL ? -EAGAIN : 0;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 0c5e3c3b7fd..8a1628023bd 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -185,7 +185,7 @@ int inet6_csk_xmit(struct sk_buff *skb)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flowi fl;
 	struct dst_entry *dst;
-	struct in6_addr *final_p = NULL, final;
+	struct in6_addr *final_p, final;
 
 	memset(&fl, 0, sizeof(fl));
 	fl.proto = sk->sk_protocol;
@@ -199,12 +199,7 @@ int inet6_csk_xmit(struct sk_buff *skb)
 	fl.fl_ip_dport = inet->inet_dport;
 	security_sk_classify_flow(sk, &fl);
 
-	if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, np->opt, &final);
 
 	dst = __inet6_csk_dst_check(sk, np->dst_cookie);
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 92a122b7795..de382114609 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -165,7 +165,7 @@ static __inline__ void node_free(struct fib6_node * fn)
 static __inline__ void rt6_release(struct rt6_info *rt)
 {
 	if (atomic_dec_and_test(&rt->rt6i_ref))
-		dst_free(&rt->u.dst);
+		dst_free(&rt->dst);
 }
 
 static void fib6_link_table(struct net *net, struct fib6_table *tb)
@@ -278,7 +278,7 @@ static int fib6_dump_node(struct fib6_walker_t *w)
 	int res;
 	struct rt6_info *rt;
 
-	for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) {
+	for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
 		res = rt6_dump_route(rt, w->args);
 		if (res < 0) {
 			/* Frame is full, suspend walking */
@@ -619,7 +619,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 
 	ins = &fn->leaf;
 
-	for (iter = fn->leaf; iter; iter=iter->u.dst.rt6_next) {
+	for (iter = fn->leaf; iter; iter=iter->dst.rt6_next) {
 		/*
 		 *	Search for duplicates
 		 */
@@ -647,7 +647,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		if (iter->rt6i_metric > rt->rt6i_metric)
 			break;
 
-		ins = &iter->u.dst.rt6_next;
+		ins = &iter->dst.rt6_next;
 	}
 
 	/* Reset round-robin state, if necessary */
@@ -658,7 +658,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 	 *	insert node
 	 */
 
-	rt->u.dst.rt6_next = iter;
+	rt->dst.rt6_next = iter;
 	*ins = rt;
 	rt->rt6i_node = fn;
 	atomic_inc(&rt->rt6i_ref);
@@ -799,7 +799,7 @@ out:
 			atomic_inc(&pn->leaf->rt6i_ref);
 		}
 #endif
-		dst_free(&rt->u.dst);
+		dst_free(&rt->dst);
 	}
 	return err;
 
@@ -810,7 +810,7 @@ out:
 st_failure:
 	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
 		fib6_repair_tree(info->nl_net, fn);
-	dst_free(&rt->u.dst);
+	dst_free(&rt->dst);
 	return err;
 #endif
 }
@@ -1108,7 +1108,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 	RT6_TRACE("fib6_del_route\n");
 
 	/* Unlink it */
-	*rtp = rt->u.dst.rt6_next;
+	*rtp = rt->dst.rt6_next;
 	rt->rt6i_node = NULL;
 	net->ipv6.rt6_stats->fib_rt_entries--;
 	net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1122,14 +1122,14 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 	FOR_WALKERS(w) {
 		if (w->state == FWS_C && w->leaf == rt) {
 			RT6_TRACE("walker %p adjusted by delroute\n", w);
-			w->leaf = rt->u.dst.rt6_next;
+			w->leaf = rt->dst.rt6_next;
 			if (w->leaf == NULL)
 				w->state = FWS_U;
 		}
 	}
 	read_unlock(&fib6_walker_lock);
 
-	rt->u.dst.rt6_next = NULL;
+	rt->dst.rt6_next = NULL;
 
 	/* If it was last route, expunge its radix tree node */
 	if (fn->leaf == NULL) {
@@ -1168,7 +1168,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 	struct rt6_info **rtp;
 
 #if RT6_DEBUG >= 2
-	if (rt->u.dst.obsolete>0) {
+	if (rt->dst.obsolete>0) {
 		WARN_ON(fn != NULL);
 		return -ENOENT;
 	}
@@ -1195,7 +1195,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 	 *	Walk the leaf entries looking for ourself
 	 */
 
-	for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.dst.rt6_next) {
+	for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
 		if (*rtp == rt) {
 			fib6_del_route(fn, rtp, info);
 			return 0;
@@ -1334,7 +1334,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
 		.nl_net = c->net,
 	};
 
-	for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) {
+	for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
 		res = c->func(rt, c->arg);
 		if (res < 0) {
 			w->leaf = rt;
@@ -1448,8 +1448,8 @@ static int fib6_age(struct rt6_info *rt, void *arg)
 		}
 		gc_args.more++;
 	} else if (rt->rt6i_flags & RTF_CACHE) {
-		if (atomic_read(&rt->u.dst.__refcnt) == 0 &&
-		    time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) {
+		if (atomic_read(&rt->dst.__refcnt) == 0 &&
+		    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
 			RT6_TRACE("aging clone %p\n", rt);
 			return -1;
 		} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
@@ -1500,15 +1500,18 @@ static void fib6_gc_timer_cb(unsigned long arg)
 
 static int __net_init fib6_net_init(struct net *net)
 {
+	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+
 	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
 
 	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
 	if (!net->ipv6.rt6_stats)
 		goto out_timer;
 
-	net->ipv6.fib_table_hash = kcalloc(FIB6_TABLE_HASHSZ,
-					   sizeof(*net->ipv6.fib_table_hash),
-					   GFP_KERNEL);
+	/* Avoid false sharing : Use at least a full cache line */
+	size = max_t(size_t, size, L1_CACHE_BYTES);
+
+	net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
 	if (!net->ipv6.fib_table_hash)
 		goto out_rt6_stats;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89425af0684..99157b4cd56 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -637,9 +637,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	}
 	mtu -= hlen + sizeof(struct frag_hdr);
 
-	if (skb_has_frags(skb)) {
+	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
-		int truesizes = 0;
+		struct sk_buff *frag2;
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
@@ -651,18 +651,18 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 			if (frag->len > mtu ||
 			    ((frag->len & 7) && frag->next) ||
 			    skb_headroom(frag) < hlen)
-			    goto slow_path;
+				goto slow_path_clean;
 
 			/* Partially cloned skb? */
 			if (skb_shared(frag))
-				goto slow_path;
+				goto slow_path_clean;
 
 			BUG_ON(frag->sk);
 			if (skb->sk) {
 				frag->sk = skb->sk;
 				frag->destructor = sock_wfree;
-				truesizes += frag->truesize;
 			}
+			skb->truesize -= frag->truesize;
 		}
 
 		err = 0;
@@ -693,12 +693,11 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 		first_len = skb_pagelen(skb);
 		skb->data_len = first_len - skb_headlen(skb);
-		skb->truesize -= truesizes;
 		skb->len = first_len;
 		ipv6_hdr(skb)->payload_len = htons(first_len -
 						   sizeof(struct ipv6hdr));
 
-		dst_hold(&rt->u.dst);
+		dst_hold(&rt->dst);
 
 		for (;;) {
 			/* Prepare header of the next frame,
@@ -726,7 +725,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			err = output(skb);
 			if(!err)
-				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
+				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 					      IPSTATS_MIB_FRAGCREATES);
 
 			if (err || !frag)
@@ -740,9 +739,9 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		kfree(tmp_hdr);
 
 		if (err == 0) {
-			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
+			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 				      IPSTATS_MIB_FRAGOKS);
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 			return 0;
 		}
 
@@ -752,10 +751,19 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 			frag = skb;
 		}
 
-		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
+		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 			      IPSTATS_MIB_FRAGFAILS);
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 		return err;
+
+slow_path_clean:
+		skb_walk_frags(skb, frag2) {
+			if (frag2 == frag)
+				break;
+			frag2->sk = NULL;
+			frag2->destructor = NULL;
+			skb->truesize += frag2->truesize;
+		}
 	}
 
 slow_path:
@@ -785,7 +793,7 @@ slow_path:
 		 *	Allocate buffer.
 		 */
 
-		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
+		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 				      IPSTATS_MIB_FRAGFAILS);
@@ -798,7 +806,7 @@ slow_path:
 		 */
 
 		ip6_copy_metadata(frag, skb);
-		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
+		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 		skb_reset_network_header(frag);
 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
@@ -870,8 +878,8 @@ static inline int ip6_rt_check(struct rt6key *rt_key,
 			       struct in6_addr *fl_addr,
 			       struct in6_addr *addr_cache)
 {
-	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
-		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
+	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 }
 
 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
@@ -1156,24 +1164,24 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 
 			/* need source address above miyazawa*/
 		}
-		dst_hold(&rt->u.dst);
-		inet->cork.dst = &rt->u.dst;
+		dst_hold(&rt->dst);
+		inet->cork.dst = &rt->dst;
 		inet->cork.fl = *fl;
 		np->cork.hop_limit = hlimit;
 		np->cork.tclass = tclass;
 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
-		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
+		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
 		if (np->frag_size < mtu) {
 			if (np->frag_size)
 				mtu = np->frag_size;
 		}
 		inet->cork.fragsize = mtu;
-		if (dst_allfrag(rt->u.dst.path))
+		if (dst_allfrag(rt->dst.path))
 			inet->cork.flags |= IPCORK_ALLFRAG;
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
 		sk->sk_sndmsg_off = 0;
-		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
+		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
 			    rt->rt6i_nfheader_len;
 		length += exthdrlen;
 		transhdrlen += exthdrlen;
@@ -1186,7 +1194,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		mtu = inet->cork.fragsize;
 	}
 
-	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 
 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
 			(opt ? opt->opt_nflen : 0);
@@ -1224,7 +1232,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 		}
 
 		if (proto == IPPROTO_UDP &&
-		    (rt->u.dst.dev->features & NETIF_F_UFO)) {
+		    (rt->dst.dev->features & NETIF_F_UFO)) {
 
 			err = ip6_ufo_append_data(sk, getfrag, from, length,
 						  hh_len, fragheaderlen,
@@ -1270,7 +1278,7 @@ alloc_new_skb:
 
 			fraglen = datalen + fragheaderlen;
 			if ((flags & MSG_MORE) &&
-			    !(rt->u.dst.dev->features&NETIF_F_SG))
+			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
 			else
 				alloclen = datalen + fragheaderlen;
@@ -1281,7 +1289,7 @@ alloc_new_skb:
 			 * because we have no idea if we're the last one.
 			 */
 			if (datalen == length + fraggap)
-				alloclen += rt->u.dst.trailer_len;
+				alloclen += rt->dst.trailer_len;
 
 			/*
 			 * We just reserve space for fragment header.
@@ -1358,7 +1366,7 @@ alloc_new_skb:
 		if (copy > length)
 			copy = length;
 
-		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+		if (!(rt->dst.dev->features&NETIF_F_SG)) {
 			unsigned int off;
 
 			off = skb->len;
@@ -1503,7 +1511,7 @@ int ip6_push_pending_frames(struct sock *sk)
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
-	skb_dst_set(skb, dst_clone(&rt->u.dst));
+	skb_dst_set(skb, dst_clone(&rt->dst));
 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
 	if (proto == IPPROTO_ICMPV6) {
 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 8f39893d808..2a59610c2a5 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -75,7 +75,7 @@ MODULE_LICENSE("GPL");
 		     (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
 		    (HASH_SIZE - 1))
 
-static void ip6_tnl_dev_init(struct net_device *dev);
+static int ip6_tnl_dev_init(struct net_device *dev);
 static void ip6_tnl_dev_setup(struct net_device *dev);
 
 static int ip6_tnl_net_id __read_mostly;
@@ -83,15 +83,42 @@ struct ip6_tnl_net {
 	/* the IPv6 tunnel fallback device */
 	struct net_device *fb_tnl_dev;
 	/* lists for storing tunnels in use */
-	struct ip6_tnl *tnls_r_l[HASH_SIZE];
-	struct ip6_tnl *tnls_wc[1];
-	struct ip6_tnl **tnls[2];
+	struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
+	struct ip6_tnl __rcu *tnls_wc[1];
+	struct ip6_tnl __rcu **tnls[2];
 };
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ip6_get_stats(struct net_device *dev)
+{
+	struct pcpu_tstats sum = { 0 };
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+		sum.rx_packets += tstats->rx_packets;
+		sum.rx_bytes   += tstats->rx_bytes;
+		sum.tx_packets += tstats->tx_packets;
+		sum.tx_bytes   += tstats->tx_bytes;
+	}
+	dev->stats.rx_packets = sum.rx_packets;
+	dev->stats.rx_bytes   = sum.rx_bytes;
+	dev->stats.tx_packets = sum.tx_packets;
+	dev->stats.tx_bytes   = sum.tx_bytes;
+	return &dev->stats;
+}
+
 /*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
  */
-static DEFINE_SPINLOCK(ip6_tnl_lock);
 
 static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
 {
@@ -138,8 +165,8 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
 static struct ip6_tnl *
 ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local)
 {
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(local);
 	struct ip6_tnl *t;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
@@ -167,7 +194,7 @@ ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local)
  * Return: head of IPv6 tunnel list
  **/
 
-static struct ip6_tnl **
+static struct ip6_tnl __rcu **
 ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p)
 {
 	struct in6_addr *remote = &p->raddr;
@@ -190,12 +217,10 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p)
 static void
 ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
 {
-	struct ip6_tnl **tp = ip6_tnl_bucket(ip6n, &t->parms);
+	struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
 
-	spin_lock_bh(&ip6_tnl_lock);
-	t->next = *tp;
+	rcu_assign_pointer(t->next , rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
-	spin_unlock_bh(&ip6_tnl_lock);
 }
 
 /**
@@ -206,18 +231,25 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
 static void
 ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
 {
-	struct ip6_tnl **tp;
-
-	for (tp = ip6_tnl_bucket(ip6n, &t->parms); *tp; tp = &(*tp)->next) {
-		if (t == *tp) {
-			spin_lock_bh(&ip6_tnl_lock);
-			*tp = t->next;
-			spin_unlock_bh(&ip6_tnl_lock);
+	struct ip6_tnl __rcu **tp;
+	struct ip6_tnl *iter;
+
+	for (tp = ip6_tnl_bucket(ip6n, &t->parms);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
 			break;
 		}
 	}
 }
 
+static void ip6_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
 /**
  * ip6_tnl_create() - create a new tunnel
  *   @p: tunnel parameters
@@ -256,7 +288,9 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
 
 	t = netdev_priv(dev);
 	t->parms = *p;
-	ip6_tnl_dev_init(dev);
+	err = ip6_tnl_dev_init(dev);
+	if (err < 0)
+		goto failed_free;
 
 	if ((err = register_netdevice(dev)) < 0)
 		goto failed_free;
@@ -266,7 +300,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
 	return t;
 
 failed_free:
-	free_netdev(dev);
+	ip6_dev_free(dev);
 failed:
 	return NULL;
 }
@@ -290,10 +324,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
 {
 	struct in6_addr *remote = &p->raddr;
 	struct in6_addr *local = &p->laddr;
+	struct ip6_tnl __rcu **tp;
 	struct ip6_tnl *t;
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
-	for (t = *ip6_tnl_bucket(ip6n, p); t; t = t->next) {
+	for (tp = ip6_tnl_bucket(ip6n, p);
+	     (t = rtnl_dereference(*tp)) != NULL;
+	     tp = &t->next) {
 		if (ipv6_addr_equal(local, &t->parms.laddr) &&
 		    ipv6_addr_equal(remote, &t->parms.raddr))
 			return t;
@@ -318,13 +355,10 @@ ip6_tnl_dev_uninit(struct net_device *dev)
 	struct net *net = dev_net(dev);
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
 
-	if (dev == ip6n->fb_tnl_dev) {
-		spin_lock_bh(&ip6_tnl_lock);
-		ip6n->tnls_wc[0] = NULL;
-		spin_unlock_bh(&ip6_tnl_lock);
-	} else {
+	if (dev == ip6n->fb_tnl_dev)
+		rcu_assign_pointer(ip6n->tnls_wc[0], NULL);
+	else
 		ip6_tnl_unlink(ip6n, t);
-	}
 	ip6_tnl_dst_reset(t);
 	dev_put(dev);
 }
@@ -552,7 +586,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (ip_route_output_key(dev_net(skb->dev), &rt, &fl))
 		goto out;
 
-	skb2->dev = rt->u.dst.dev;
+	skb2->dev = rt->dst.dev;
 
 	/* route "incoming" packet */
 	if (rt->rt_flags & RTCF_LOCAL) {
@@ -562,7 +596,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		fl.fl4_src = eiph->saddr;
 		fl.fl4_tos = eiph->tos;
 		if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
-		    rt->u.dst.dev->type != ARPHRD_TUNNEL) {
+		    rt->dst.dev->type != ARPHRD_TUNNEL) {
 			ip_rt_put(rt);
 			goto out;
 		}
@@ -626,7 +660,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		icmpv6_send(skb2, rel_type, rel_code, rel_info);
 
 		if (rt)
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 
 		kfree_skb(skb2);
 	}
@@ -702,6 +736,8 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 
 	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
 					&ipv6h->daddr)) != NULL) {
+		struct pcpu_tstats *tstats;
+
 		if (t->parms.proto != ipproto && t->parms.proto != 0) {
 			rcu_read_unlock();
 			goto discard;
@@ -724,10 +760,16 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		skb->pkt_type = PACKET_HOST;
 		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
 
-		skb_tunnel_rx(skb, t->dev);
+		tstats = this_cpu_ptr(t->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, t->dev);
 
 		dscp_ecn_decapsulate(t, ipv6h, skb);
+
 		netif_rx(skb);
+
 		rcu_read_unlock();
 		return 0;
 	}
@@ -934,8 +976,10 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	err = ip6_local_out(skb);
 
 	if (net_xmit_eval(err) == 0) {
-		stats->tx_bytes += pkt_len;
-		stats->tx_packets++;
+		struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);
+
+		tstats->tx_bytes += pkt_len;
+		tstats->tx_packets++;
 	} else {
 		stats->tx_errors++;
 		stats->tx_aborted_errors++;
@@ -1135,7 +1179,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 			if (dev->mtu < IPV6_MIN_MTU)
 				dev->mtu = IPV6_MIN_MTU;
 		}
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 	}
 }
 
@@ -1240,6 +1284,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 				t = netdev_priv(dev);
 
 			ip6_tnl_unlink(ip6n, t);
+			synchronize_net();
 			err = ip6_tnl_change(t, &p);
 			ip6_tnl_link(ip6n, t);
 			netdev_state_change(dev);
@@ -1300,12 +1345,14 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
 
 
 static const struct net_device_ops ip6_tnl_netdev_ops = {
-	.ndo_uninit = ip6_tnl_dev_uninit,
+	.ndo_uninit	= ip6_tnl_dev_uninit,
 	.ndo_start_xmit = ip6_tnl_xmit,
-	.ndo_do_ioctl = ip6_tnl_ioctl,
+	.ndo_do_ioctl	= ip6_tnl_ioctl,
 	.ndo_change_mtu = ip6_tnl_change_mtu,
+	.ndo_get_stats	= ip6_get_stats,
 };
 
+
 /**
  * ip6_tnl_dev_setup - setup virtual tunnel device
  *   @dev: virtual device associated with tunnel
@@ -1317,7 +1364,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
 static void ip6_tnl_dev_setup(struct net_device *dev)
 {
 	dev->netdev_ops = &ip6_tnl_netdev_ops;
-	dev->destructor = free_netdev;
+	dev->destructor = ip6_dev_free;
 
 	dev->type = ARPHRD_TUNNEL6;
 	dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
@@ -1325,6 +1372,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
 	dev->flags |= IFF_NOARP;
 	dev->addr_len = sizeof(struct in6_addr);
 	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
 }
 
 
@@ -1333,12 +1381,17 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
  *   @dev: virtual device associated with tunnel
  **/
 
-static inline void
+static inline int
 ip6_tnl_dev_init_gen(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
+
 	t->dev = dev;
 	strcpy(t->parms.name, dev->name);
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+	return 0;
 }
 
 /**
@@ -1346,11 +1399,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
  *   @dev: virtual device associated with tunnel
  **/
 
-static void ip6_tnl_dev_init(struct net_device *dev)
+static int ip6_tnl_dev_init(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
-	ip6_tnl_dev_init_gen(dev);
+	int err = ip6_tnl_dev_init_gen(dev);
+
+	if (err)
+		return err;
 	ip6_tnl_link_config(t);
+	return 0;
 }
 
 /**
@@ -1360,25 +1417,29 @@ static void ip6_tnl_dev_init(struct net_device *dev)
  * Return: 0
  **/
 
-static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
+static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct net *net = dev_net(dev);
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+	int err = ip6_tnl_dev_init_gen(dev);
+
+	if (err)
+		return err;
 
-	ip6_tnl_dev_init_gen(dev);
 	t->parms.proto = IPPROTO_IPV6;
 	dev_hold(dev);
-	ip6n->tnls_wc[0] = t;
+	rcu_assign_pointer(ip6n->tnls_wc[0], t);
+	return 0;
 }
 
-static struct xfrm6_tunnel ip4ip6_handler = {
+static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
 	.handler	= ip4ip6_rcv,
 	.err_handler	= ip4ip6_err,
 	.priority	=	1,
 };
 
-static struct xfrm6_tunnel ip6ip6_handler = {
+static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
 	.handler	= ip6ip6_rcv,
 	.err_handler	= ip6ip6_err,
 	.priority	=	1,
@@ -1391,14 +1452,14 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
 	LIST_HEAD(list);
 
 	for (h = 0; h < HASH_SIZE; h++) {
-		t = ip6n->tnls_r_l[h];
+		t = rtnl_dereference(ip6n->tnls_r_l[h]);
 		while (t != NULL) {
 			unregister_netdevice_queue(t->dev, &list);
-			t = t->next;
+			t = rtnl_dereference(t->next);
 		}
 	}
 
-	t = ip6n->tnls_wc[0];
+	t = rtnl_dereference(ip6n->tnls_wc[0]);
 	unregister_netdevice_queue(t->dev, &list);
 	unregister_netdevice_many(&list);
 }
@@ -1419,7 +1480,9 @@ static int __net_init ip6_tnl_init_net(struct net *net)
 		goto err_alloc_dev;
 	dev_net_set(ip6n->fb_tnl_dev, net);
 
-	ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+	err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
+	if (err < 0)
+		goto err_register;
 
 	err = register_netdev(ip6n->fb_tnl_dev);
 	if (err < 0)
@@ -1427,7 +1490,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
 	return 0;
 
 err_register:
-	free_netdev(ip6n->fb_tnl_dev);
+	ip6_dev_free(ip6n->fb_tnl_dev);
 err_alloc_dev:
 	return err;
 }
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 66078dad7fe..6f32ffce702 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -667,6 +667,7 @@ static int pim6_rcv(struct sk_buff *skb)
 	skb_tunnel_rx(skb, reg_dev);
 
 	netif_rx(skb);
+
 	dev_put(reg_dev);
 	return 0;
  drop:
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index bd43f0152c2..d1770e061c0 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -55,8 +55,6 @@
 
 #include <asm/uaccess.h>
 
-DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
-
 struct ip6_ra_chain *ip6_ra_chain;
 DEFINE_RWLOCK(ip6_ra_lock);
 
@@ -344,6 +342,25 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		retv = 0;
 		break;
 
+	case IPV6_TRANSPARENT:
+		if (!capable(CAP_NET_ADMIN)) {
+			retv = -EPERM;
+			break;
+		}
+		if (optlen < sizeof(int))
+			goto e_inval;
+		/* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
+		inet_sk(sk)->transparent = valbool;
+		retv = 0;
+		break;
+
+	case IPV6_RECVORIGDSTADDR:
+		if (optlen < sizeof(int))
+			goto e_inval;
+		np->rxopt.bits.rxorigdstaddr = valbool;
+		retv = 0;
+		break;
+
 	case IPV6_HOPOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	case IPV6_RTHDR:
@@ -1106,6 +1123,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 	}
 
+	case IPV6_TRANSPARENT:
+		val = inet_sk(sk)->transparent;
+		break;
+
+	case IPV6_RECVORIGDSTADDR:
+		val = np->rxopt.bits.rxorigdstaddr;
+		break;
+
 	case IPV6_UNICAST_HOPS:
 	case IPV6_MULTICAST_HOPS:
 	{
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index ab1622d7d40..d1444b95ad7 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -152,18 +152,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	mc_lst->next = NULL;
 	ipv6_addr_copy(&mc_lst->addr, addr);
 
+	rcu_read_lock();
 	if (ifindex == 0) {
 		struct rt6_info *rt;
 		rt = rt6_lookup(net, addr, NULL, 0, 0);
 		if (rt) {
 			dev = rt->rt6i_dev;
-			dev_hold(dev);
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 		}
 	} else
-		dev = dev_get_by_index(net, ifindex);
+		dev = dev_get_by_index_rcu(net, ifindex);
 
 	if (dev == NULL) {
+		rcu_read_unlock();
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
 		return -ENODEV;
 	}
@@ -180,8 +181,8 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	err = ipv6_dev_mc_inc(dev, addr);
 
 	if (err) {
+		rcu_read_unlock();
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
-		dev_put(dev);
 		return err;
 	}
 
@@ -190,7 +191,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	np->ipv6_mc_list = mc_lst;
 	write_unlock_bh(&ipv6_sk_mc_lock);
 
-	dev_put(dev);
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -213,18 +214,17 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
 			*lnk = mc_lst->next;
 			write_unlock_bh(&ipv6_sk_mc_lock);
 
-			dev = dev_get_by_index(net, mc_lst->ifindex);
+			rcu_read_lock();
+			dev = dev_get_by_index_rcu(net, mc_lst->ifindex);
 			if (dev != NULL) {
-				struct inet6_dev *idev = in6_dev_get(dev);
+				struct inet6_dev *idev = __in6_dev_get(dev);
 
 				(void) ip6_mc_leave_src(sk, mc_lst, idev);
-				if (idev) {
+				if (idev)
 					__ipv6_dev_mc_dec(idev, &mc_lst->addr);
-					in6_dev_put(idev);
-				}
-				dev_put(dev);
 			} else
 				(void) ip6_mc_leave_src(sk, mc_lst, NULL);
+			rcu_read_unlock();
 			sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
 			return 0;
 		}
@@ -234,43 +234,36 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	return -EADDRNOTAVAIL;
 }
 
-static struct inet6_dev *ip6_mc_find_dev(struct net *net,
-					 struct in6_addr *group,
-					 int ifindex)
+/* called with rcu_read_lock() */
+static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
+					     struct in6_addr *group,
+					     int ifindex)
 {
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev = NULL;
 
 	if (ifindex == 0) {
-		struct rt6_info *rt;
+		struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0);
 
-		rt = rt6_lookup(net, group, NULL, 0, 0);
 		if (rt) {
 			dev = rt->rt6i_dev;
 			dev_hold(dev);
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 		}
 	} else
-		dev = dev_get_by_index(net, ifindex);
+		dev = dev_get_by_index_rcu(net, ifindex);
 
 	if (!dev)
-		goto nodev;
-	idev = in6_dev_get(dev);
+		return NULL;
+	idev = __in6_dev_get(dev);
 	if (!idev)
-		goto release;
+		return NULL;;
 	read_lock_bh(&idev->lock);
-	if (idev->dead)
-		goto unlock_release;
-
+	if (idev->dead) {
+		read_unlock_bh(&idev->lock);
+		return NULL;
+	}
 	return idev;
-
-unlock_release:
-	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
-release:
-	dev_put(dev);
-nodev:
-	return NULL;
 }
 
 void ipv6_sock_mc_close(struct sock *sk)
@@ -286,19 +279,17 @@ void ipv6_sock_mc_close(struct sock *sk)
 		np->ipv6_mc_list = mc_lst->next;
 		write_unlock_bh(&ipv6_sk_mc_lock);
 
-		dev = dev_get_by_index(net, mc_lst->ifindex);
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(net, mc_lst->ifindex);
 		if (dev) {
-			struct inet6_dev *idev = in6_dev_get(dev);
+			struct inet6_dev *idev = __in6_dev_get(dev);
 
 			(void) ip6_mc_leave_src(sk, mc_lst, idev);
-			if (idev) {
+			if (idev)
 				__ipv6_dev_mc_dec(idev, &mc_lst->addr);
-				in6_dev_put(idev);
-			}
-			dev_put(dev);
 		} else
 			(void) ip6_mc_leave_src(sk, mc_lst, NULL);
-
+		rcu_read_unlock();
 		sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
 
 		write_lock_bh(&ipv6_sk_mc_lock);
@@ -327,14 +318,17 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	if (!ipv6_addr_is_multicast(group))
 		return -EINVAL;
 
-	idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface);
-	if (!idev)
+	rcu_read_lock();
+	idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface);
+	if (!idev) {
+		rcu_read_unlock();
 		return -ENODEV;
+	}
 	dev = idev->dev;
 
 	err = -EADDRNOTAVAIL;
 
-	read_lock_bh(&ipv6_sk_mc_lock);
+	read_lock(&ipv6_sk_mc_lock);
 	for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
 		if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
 			continue;
@@ -358,7 +352,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 		pmc->sfmode = omode;
 	}
 
-	write_lock_bh(&pmc->sflock);
+	write_lock(&pmc->sflock);
 	pmclocked = 1;
 
 	psl = pmc->sflist;
@@ -433,11 +427,10 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	ip6_mc_add_src(idev, group, omode, 1, source, 1);
 done:
 	if (pmclocked)
-		write_unlock_bh(&pmc->sflock);
-	read_unlock_bh(&ipv6_sk_mc_lock);
+		write_unlock(&pmc->sflock);
+	read_unlock(&ipv6_sk_mc_lock);
 	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
-	dev_put(dev);
+	rcu_read_unlock();
 	if (leavegroup)
 		return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
 	return err;
@@ -463,14 +456,17 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 	    gsf->gf_fmode != MCAST_EXCLUDE)
 		return -EINVAL;
 
-	idev = ip6_mc_find_dev(net, group, gsf->gf_interface);
+	rcu_read_lock();
+	idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
 
-	if (!idev)
+	if (!idev) {
+		rcu_read_unlock();
 		return -ENODEV;
+	}
 	dev = idev->dev;
 
 	err = 0;
-	read_lock_bh(&ipv6_sk_mc_lock);
+	read_lock(&ipv6_sk_mc_lock);
 
 	if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
 		leavegroup = 1;
@@ -512,7 +508,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 		(void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
 	}
 
-	write_lock_bh(&pmc->sflock);
+	write_lock(&pmc->sflock);
 	psl = pmc->sflist;
 	if (psl) {
 		(void) ip6_mc_del_src(idev, group, pmc->sfmode,
@@ -522,13 +518,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
 		(void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
 	pmc->sflist = newpsl;
 	pmc->sfmode = gsf->gf_fmode;
-	write_unlock_bh(&pmc->sflock);
+	write_unlock(&pmc->sflock);
 	err = 0;
 done:
-	read_unlock_bh(&ipv6_sk_mc_lock);
+	read_unlock(&ipv6_sk_mc_lock);
 	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
-	dev_put(dev);
+	rcu_read_unlock();
 	if (leavegroup)
 		err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
 	return err;
@@ -551,11 +546,13 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
 	if (!ipv6_addr_is_multicast(group))
 		return -EINVAL;
 
-	idev = ip6_mc_find_dev(net, group, gsf->gf_interface);
+	rcu_read_lock();
+	idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
 
-	if (!idev)
+	if (!idev) {
+		rcu_read_unlock();
 		return -ENODEV;
-
+	}
 	dev = idev->dev;
 
 	err = -EADDRNOTAVAIL;
@@ -577,8 +574,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
 	psl = pmc->sflist;
 	count = psl ? psl->sl_count : 0;
 	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
-	dev_put(dev);
+	rcu_read_unlock();
 
 	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
 	gsf->gf_numsrc = count;
@@ -604,8 +600,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
 	return 0;
 done:
 	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
-	dev_put(dev);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -822,6 +817,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 	struct ifmcaddr6 *mc;
 	struct inet6_dev *idev;
 
+	/* we need to take a reference on idev */
 	idev = in6_dev_get(dev);
 
 	if (idev == NULL)
@@ -860,7 +856,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
 	setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc);
 
 	ipv6_addr_copy(&mc->mca_addr, addr);
-	mc->idev = idev;
+	mc->idev = idev; /* (reference taken) */
 	mc->mca_users = 1;
 	/* mca_stamp should be updated upon changes */
 	mc->mca_cstamp = mc->mca_tstamp = jiffies;
@@ -915,16 +911,18 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 
 int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
 {
-	struct inet6_dev *idev = in6_dev_get(dev);
+	struct inet6_dev *idev;
 	int err;
 
-	if (!idev)
-		return -ENODEV;
-
-	err = __ipv6_dev_mc_dec(idev, addr);
+	rcu_read_lock();
 
-	in6_dev_put(idev);
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		err = -ENODEV;
+	else
+		err = __ipv6_dev_mc_dec(idev, addr);
 
+	rcu_read_unlock();
 	return err;
 }
 
@@ -965,7 +963,8 @@ int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 	struct ifmcaddr6 *mc;
 	int rv = 0;
 
-	idev = in6_dev_get(dev);
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
 	if (idev) {
 		read_lock_bh(&idev->lock);
 		for (mc = idev->mc_list; mc; mc=mc->next) {
@@ -992,8 +991,8 @@ int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 				rv = 1; /* don't filter unspecified source */
 		}
 		read_unlock_bh(&idev->lock);
-		in6_dev_put(idev);
 	}
+	rcu_read_unlock();
 	return rv;
 }
 
@@ -1104,6 +1103,7 @@ static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
 	return 1;
 }
 
+/* called with rcu_read_lock() */
 int igmp6_event_query(struct sk_buff *skb)
 {
 	struct mld2_query *mlh2 = NULL;
@@ -1127,7 +1127,7 @@ int igmp6_event_query(struct sk_buff *skb)
 	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
 		return -EINVAL;
 
-	idev = in6_dev_get(skb->dev);
+	idev = __in6_dev_get(skb->dev);
 
 	if (idev == NULL)
 		return 0;
@@ -1137,10 +1137,8 @@ int igmp6_event_query(struct sk_buff *skb)
 	group_type = ipv6_addr_type(group);
 
 	if (group_type != IPV6_ADDR_ANY &&
-	    !(group_type&IPV6_ADDR_MULTICAST)) {
-		in6_dev_put(idev);
+	    !(group_type&IPV6_ADDR_MULTICAST))
 		return -EINVAL;
-	}
 
 	if (len == 24) {
 		int switchback;
@@ -1161,10 +1159,9 @@ int igmp6_event_query(struct sk_buff *skb)
 	} else if (len >= 28) {
 		int srcs_offset = sizeof(struct mld2_query) -
 				  sizeof(struct icmp6hdr);
-		if (!pskb_may_pull(skb, srcs_offset)) {
-			in6_dev_put(idev);
+		if (!pskb_may_pull(skb, srcs_offset))
 			return -EINVAL;
-		}
+
 		mlh2 = (struct mld2_query *)skb_transport_header(skb);
 		max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000;
 		if (!max_delay)
@@ -1173,28 +1170,23 @@ int igmp6_event_query(struct sk_buff *skb)
 		if (mlh2->mld2q_qrv)
 			idev->mc_qrv = mlh2->mld2q_qrv;
 		if (group_type == IPV6_ADDR_ANY) { /* general query */
-			if (mlh2->mld2q_nsrcs) {
-				in6_dev_put(idev);
+			if (mlh2->mld2q_nsrcs)
 				return -EINVAL; /* no sources allowed */
-			}
+
 			mld_gq_start_timer(idev);
-			in6_dev_put(idev);
 			return 0;
 		}
 		/* mark sources to include, if group & source-specific */
 		if (mlh2->mld2q_nsrcs != 0) {
 			if (!pskb_may_pull(skb, srcs_offset +
-			    ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) {
-				in6_dev_put(idev);
+			    ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
 				return -EINVAL;
-			}
+
 			mlh2 = (struct mld2_query *)skb_transport_header(skb);
 			mark = 1;
 		}
-	} else {
-		in6_dev_put(idev);
+	} else
 		return -EINVAL;
-	}
 
 	read_lock_bh(&idev->lock);
 	if (group_type == IPV6_ADDR_ANY) {
@@ -1227,12 +1219,11 @@ int igmp6_event_query(struct sk_buff *skb)
 		}
 	}
 	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
 
 	return 0;
 }
 
-
+/* called with rcu_read_lock() */
 int igmp6_event_report(struct sk_buff *skb)
 {
 	struct ifmcaddr6 *ma;
@@ -1260,7 +1251,7 @@ int igmp6_event_report(struct sk_buff *skb)
 	    !(addr_type&IPV6_ADDR_LINKLOCAL))
 		return -EINVAL;
 
-	idev = in6_dev_get(skb->dev);
+	idev = __in6_dev_get(skb->dev);
 	if (idev == NULL)
 		return -ENODEV;
 
@@ -1280,7 +1271,6 @@ int igmp6_event_report(struct sk_buff *skb)
 		}
 	}
 	read_unlock_bh(&idev->lock);
-	in6_dev_put(idev);
 	return 0;
 }
 
@@ -1396,12 +1386,14 @@ static void mld_sendpack(struct sk_buff *skb)
 	struct mld2_report *pmr =
 			      (struct mld2_report *)skb_transport_header(skb);
 	int payload_len, mldlen;
-	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	struct inet6_dev *idev;
 	struct net *net = dev_net(skb->dev);
 	int err;
 	struct flowi fl;
 	struct dst_entry *dst;
 
+	rcu_read_lock();
+	idev = __in6_dev_get(skb->dev);
 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 
 	payload_len = (skb->tail - skb->network_header) - sizeof(*pip6);
@@ -1441,8 +1433,7 @@ out:
 	} else
 		IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS);
 
-	if (likely(idev != NULL))
-		in6_dev_put(idev);
+	rcu_read_unlock();
 	return;
 
 err_out:
@@ -1779,7 +1770,8 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 					 IPPROTO_ICMPV6,
 					 csum_partial(hdr, len, 0));
 
-	idev = in6_dev_get(skb->dev);
+	rcu_read_lock();
+	idev = __in6_dev_get(skb->dev);
 
 	dst = icmp6_dst_alloc(skb->dev, NULL, &ipv6_hdr(skb)->daddr);
 	if (!dst) {
@@ -1806,8 +1798,7 @@ out:
 	} else
 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 
-	if (likely(idev != NULL))
-		in6_dev_put(idev);
+	rcu_read_unlock();
 	return;
 
 err_out:
@@ -1998,8 +1989,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
 				    &psf->sf_addr))
 					break;
 			if (!dpsf) {
-				dpsf = (struct ip6_sf_list *)
-					kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
 				if (!dpsf)
 					continue;
 				*dpsf = *psf;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 2efef52fb46..998d6d27e7c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -91,7 +91,9 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
 
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev);
+static u32 ndisc_hash(const void *pkey,
+		      const struct net_device *dev,
+		      __u32 rnd);
 static int ndisc_constructor(struct neighbour *neigh);
 static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -228,12 +230,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
 	do {
 		cur = ((void *)cur) + (cur->nd_opt_len << 3);
 	} while(cur < end && cur->nd_opt_type != type);
-	return (cur <= end && cur->nd_opt_type == type ? cur : NULL);
+	return cur <= end && cur->nd_opt_type == type ? cur : NULL;
 }
 
 static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
 {
-	return (opt->nd_opt_type == ND_OPT_RDNSS);
+	return opt->nd_opt_type == ND_OPT_RDNSS;
 }
 
 static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
@@ -244,7 +246,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
 	do {
 		cur = ((void *)cur) + (cur->nd_opt_len << 3);
 	} while(cur < end && !ndisc_is_useropt(cur));
-	return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL);
+	return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
 }
 
 static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
@@ -319,7 +321,7 @@ static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
 	int prepad = ndisc_addr_option_pad(dev->type);
 	if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad))
 		return NULL;
-	return (lladdr + prepad);
+	return lladdr + prepad;
 }
 
 int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
@@ -350,7 +352,9 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
 
 EXPORT_SYMBOL(ndisc_mc_map);
 
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
+static u32 ndisc_hash(const void *pkey,
+		      const struct net_device *dev,
+		      __u32 hash_rnd)
 {
 	const u32 *p32 = pkey;
 	u32 addr_hash, i;
@@ -359,7 +363,7 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
 	for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
 		addr_hash ^= *p32++;
 
-	return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd);
+	return jhash_2words(addr_hash, dev->ifindex, hash_rnd);
 }
 
 static int ndisc_constructor(struct neighbour *neigh)
@@ -1105,6 +1109,18 @@ errout:
 	rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
 }
 
+static inline int accept_ra(struct inet6_dev *in6_dev)
+{
+	/*
+	 * If forwarding is enabled, RA are not accepted unless the special
+	 * hybrid mode (accept_ra=2) is enabled.
+	 */
+	if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2)
+		return 0;
+
+	return in6_dev->cnf.accept_ra;
+}
+
 static void ndisc_router_discovery(struct sk_buff *skb)
 {
 	struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
@@ -1158,8 +1174,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		return;
 	}
 
-	/* skip route and link configuration on routers */
-	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra)
+	if (!accept_ra(in6_dev))
 		goto skip_linkparms;
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
@@ -1229,7 +1244,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			ND_PRINTK0(KERN_ERR
 				   "ICMPv6 RA: %s() got default router without neighbour.\n",
 				   __func__);
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 			in6_dev_put(in6_dev);
 			return;
 		}
@@ -1244,7 +1259,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	if (ra_msg->icmph.icmp6_hop_limit) {
 		in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
 		if (rt)
-			rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit;
+			rt->dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit;
 	}
 
 skip_defrtr:
@@ -1309,8 +1324,7 @@ skip_linkparms:
 			     NEIGH_UPDATE_F_ISROUTER);
 	}
 
-	/* skip route and link configuration on routers */
-	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra)
+	if (!accept_ra(in6_dev))
 		goto out;
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -1363,7 +1377,7 @@ skip_linkparms:
 			in6_dev->cnf.mtu6 = mtu;
 
 			if (rt)
-				rt->u.dst.metrics[RTAX_MTU-1] = mtu;
+				rt->dst.metrics[RTAX_MTU-1] = mtu;
 
 			rt6_mtu_change(skb->dev, mtu);
 		}
@@ -1384,7 +1398,7 @@ skip_linkparms:
 	}
 out:
 	if (rt)
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 	else if (neigh)
 		neigh_release(neigh);
 	in6_dev_put(in6_dev);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index a74951c039b..7155b2451d7 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -151,9 +151,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 							 protocol,
 							 csum_sub(0, hsum)));
 		skb->ip_summed = CHECKSUM_NONE;
-		csum = __skb_checksum_complete_head(skb, dataoff + len);
-		if (!csum)
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		return __skb_checksum_complete_head(skb, dataoff + len);
 	}
 	return csum;
 };
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 29d643bcafa..448464844a2 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,10 +5,15 @@
 menu "IPv6: Netfilter Configuration"
 	depends on INET && IPV6 && NETFILTER
 
+config NF_DEFRAG_IPV6
+	tristate
+	default n
+
 config NF_CONNTRACK_IPV6
 	tristate "IPv6 connection tracking support"
 	depends on INET && IPV6 && NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
+	select NF_DEFRAG_IPV6
 	---help---
 	  Connection tracking keeps a record of what packets have passed
 	  through your machine, in order to figure out how they are related
@@ -132,10 +137,10 @@ config IP6_NF_MATCH_RT
 # The targets
 config IP6_NF_TARGET_HL
 	tristate '"HL" hoplimit target support'
-	depends on NETFILTER_ADVANCED
+	depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
 	select NETFILTER_XT_TARGET_HL
 	---help---
-	This is a backwards-compat option for the user's convenience
+	This is a backwards-compatible option for the user's convenience
 	(e.g. when running oldconfig). It selects
 	CONFIG_NETFILTER_XT_TARGET_HL.
 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index aafbba30c89..0a432c9b079 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,10 +11,14 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
 obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
 
 # objects for l3 independent conntrack
-nf_conntrack_ipv6-objs  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o
+nf_conntrack_ipv6-objs  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
 
 # l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
+obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o
+
+# defrag
+nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
+obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 8c201743d96..413ab0754e1 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -43,7 +43,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
 
 static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
 static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_RWLOCK(queue_lock);
+static DEFINE_SPINLOCK(queue_lock);
 static int peer_pid __read_mostly;
 static unsigned int copy_range __read_mostly;
 static unsigned int queue_total;
@@ -73,10 +73,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
 		break;
 
 	case IPQ_COPY_PACKET:
-		copy_mode = mode;
+		if (range > 0xFFFF)
+			range = 0xFFFF;
 		copy_range = range;
-		if (copy_range > 0xFFFF)
-			copy_range = 0xFFFF;
+		copy_mode = mode;
 		break;
 
 	default:
@@ -102,7 +102,7 @@ ipq_find_dequeue_entry(unsigned long id)
 {
 	struct nf_queue_entry *entry = NULL, *i;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	list_for_each_entry(i, &queue_list, list) {
 		if ((unsigned long)i == id) {
@@ -116,7 +116,7 @@ ipq_find_dequeue_entry(unsigned long id)
 		queue_total--;
 	}
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return entry;
 }
 
@@ -137,9 +137,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
 static void
 ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
 {
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 	__ipq_flush(cmpfn, data);
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 }
 
 static struct sk_buff *
@@ -153,9 +153,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 	struct nlmsghdr *nlh;
 	struct timeval tv;
 
-	read_lock_bh(&queue_lock);
-
-	switch (copy_mode) {
+	switch (ACCESS_ONCE(copy_mode)) {
 	case IPQ_COPY_META:
 	case IPQ_COPY_NONE:
 		size = NLMSG_SPACE(sizeof(*pmsg));
@@ -163,26 +161,21 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 
 	case IPQ_COPY_PACKET:
 		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
-		    (*errp = skb_checksum_help(entry->skb))) {
-			read_unlock_bh(&queue_lock);
+		    (*errp = skb_checksum_help(entry->skb)))
 			return NULL;
-		}
-		if (copy_range == 0 || copy_range > entry->skb->len)
+
+		data_len = ACCESS_ONCE(copy_range);
+		if (data_len == 0 || data_len > entry->skb->len)
 			data_len = entry->skb->len;
-		else
-			data_len = copy_range;
 
 		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
 		break;
 
 	default:
 		*errp = -EINVAL;
-		read_unlock_bh(&queue_lock);
 		return NULL;
 	}
 
-	read_unlock_bh(&queue_lock);
-
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb)
 		goto nlmsg_failure;
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 	if (nskb == NULL)
 		return status;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	if (!peer_pid)
 		goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 
 	__ipq_enqueue_entry(entry);
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 
 err_out_free_nskb:
 	kfree_skb(nskb);
 
 err_out_unlock:
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 }
 
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
 {
 	int status;
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 	status = __ipq_set_mode(mode, range);
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return status;
 }
 
@@ -441,11 +434,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
 	if (security_netlink_recv(skb, CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
 
-	write_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	if (peer_pid) {
 		if (peer_pid != pid) {
-			write_unlock_bh(&queue_lock);
+			spin_unlock_bh(&queue_lock);
 			RCV_SKB_FAIL(-EBUSY);
 		}
 	} else {
@@ -453,7 +446,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
 		peer_pid = pid;
 	}
 
-	write_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 
 	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
 				  nlmsglen - NLMSG_LENGTH(0));
@@ -498,10 +491,10 @@ ipq_rcv_nl_event(struct notifier_block *this,
 	struct netlink_notify *n = ptr;
 
 	if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) {
-		write_lock_bh(&queue_lock);
+		spin_lock_bh(&queue_lock);
 		if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
 			__ipq_reset();
-		write_unlock_bh(&queue_lock);
+		spin_unlock_bh(&queue_lock);
 	}
 	return NOTIFY_DONE;
 }
@@ -528,7 +521,7 @@ static ctl_table ipq_table[] = {
 #ifdef CONFIG_PROC_FS
 static int ip6_queue_show(struct seq_file *m, void *v)
 {
-	read_lock_bh(&queue_lock);
+	spin_lock_bh(&queue_lock);
 
 	seq_printf(m,
 		      "Peer PID          : %d\n"
@@ -546,7 +539,7 @@ static int ip6_queue_show(struct seq_file *m, void *v)
 		      queue_dropped,
 		      queue_user_dropped);
 
-	read_unlock_bh(&queue_lock);
+	spin_unlock_bh(&queue_lock);
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 9d2d68f0e60..51df035897e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -82,13 +82,13 @@ EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
 int
 ip6t_ext_hdr(u8 nexthdr)
 {
-	return ( (nexthdr == IPPROTO_HOPOPTS)   ||
-		 (nexthdr == IPPROTO_ROUTING)   ||
-		 (nexthdr == IPPROTO_FRAGMENT)  ||
-		 (nexthdr == IPPROTO_ESP)       ||
-		 (nexthdr == IPPROTO_AH)        ||
-		 (nexthdr == IPPROTO_NONE)      ||
-		 (nexthdr == IPPROTO_DSTOPTS) );
+	return  (nexthdr == IPPROTO_HOPOPTS)   ||
+		(nexthdr == IPPROTO_ROUTING)   ||
+		(nexthdr == IPPROTO_FRAGMENT)  ||
+		(nexthdr == IPPROTO_ESP)       ||
+		(nexthdr == IPPROTO_AH)        ||
+		(nexthdr == IPPROTO_NONE)      ||
+		(nexthdr == IPPROTO_DSTOPTS);
 }
 
 /* Returns whether matches rule or not. */
@@ -215,7 +215,7 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6)
 	return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
 }
 
-static inline const struct ip6t_entry_target *
+static inline const struct xt_entry_target *
 ip6t_get_target_c(const struct ip6t_entry *e)
 {
 	return ip6t_get_target((struct ip6t_entry *)e);
@@ -260,9 +260,9 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
 		      const char *hookname, const char **chainname,
 		      const char **comment, unsigned int *rulenum)
 {
-	const struct ip6t_standard_target *t = (void *)ip6t_get_target_c(s);
+	const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
 
-	if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) {
+	if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
 		*chainname = t->target.data;
 		(*rulenum) = 0;
@@ -271,7 +271,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
 
 		if (s->target_offset == sizeof(struct ip6t_entry) &&
 		    strcmp(t->target.u.kernel.target->name,
-			   IP6T_STANDARD_TARGET) == 0 &&
+			   XT_STANDARD_TARGET) == 0 &&
 		    t->verdict < 0 &&
 		    unconditional(&s->ipv6)) {
 			/* Tail of chains: STANDARD target (return/policy) */
@@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb,
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	do {
-		const struct ip6t_entry_target *t;
+		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
@@ -387,9 +387,7 @@ ip6t_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters,
-			    ntohs(ipv6_hdr(skb)->payload_len) +
-			    sizeof(struct ipv6hdr), 1);
+		ADD_COUNTER(e->counters, skb->len, 1);
 
 		t = ip6t_get_target_c(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -405,10 +403,10 @@ ip6t_do_table(struct sk_buff *skb,
 		if (!t->u.kernel.target->target) {
 			int v;
 
-			v = ((struct ip6t_standard_target *)t)->verdict;
+			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
-				if (v != IP6T_RETURN) {
+				if (v != XT_RETURN) {
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
@@ -436,7 +434,7 @@ ip6t_do_table(struct sk_buff *skb,
 		acpar.targinfo = t->data;
 
 		verdict = t->u.kernel.target->target(skb, &acpar);
-		if (verdict == IP6T_CONTINUE)
+		if (verdict == XT_CONTINUE)
 			e = ip6t_next_entry(e);
 		else
 			/* Verdict */
@@ -476,7 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			const struct ip6t_standard_target *t
+			const struct xt_standard_target *t
 				= (void *)ip6t_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
@@ -490,13 +488,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			/* Unconditional return/END. */
 			if ((e->target_offset == sizeof(struct ip6t_entry) &&
 			     (strcmp(t->target.u.user.name,
-				     IP6T_STANDARD_TARGET) == 0) &&
+				     XT_STANDARD_TARGET) == 0) &&
 			     t->verdict < 0 &&
 			     unconditional(&e->ipv6)) || visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
-					    IP6T_STANDARD_TARGET) == 0) &&
+					    XT_STANDARD_TARGET) == 0) &&
 				    t->verdict < -NF_MAX_VERDICT - 1) {
 					duprintf("mark_source_chains: bad "
 						"negative verdict (%i)\n",
@@ -539,7 +537,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   IP6T_STANDARD_TARGET) == 0 &&
+					   XT_STANDARD_TARGET) == 0 &&
 				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct ip6t_entry)) {
@@ -567,7 +565,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	return 1;
 }
 
-static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
 {
 	struct xt_mtdtor_param par;
 
@@ -583,14 +581,14 @@ static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
 static int
 check_entry(const struct ip6t_entry *e, const char *name)
 {
-	const struct ip6t_entry_target *t;
+	const struct xt_entry_target *t;
 
 	if (!ip6_checkentry(&e->ipv6)) {
 		duprintf("ip_tables: ip check failed %p %s.\n", e, name);
 		return -EINVAL;
 	}
 
-	if (e->target_offset + sizeof(struct ip6t_entry_target) >
+	if (e->target_offset + sizeof(struct xt_entry_target) >
 	    e->next_offset)
 		return -EINVAL;
 
@@ -601,7 +599,7 @@ check_entry(const struct ip6t_entry *e, const char *name)
 	return 0;
 }
 
-static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ip6t_ip6 *ipv6 = par->entryinfo;
 	int ret;
@@ -620,7 +618,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
 }
 
 static int
-find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	struct xt_match *match;
 	int ret;
@@ -645,7 +643,7 @@ err:
 
 static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
 {
-	struct ip6t_entry_target *t = ip6t_get_target(e);
+	struct xt_entry_target *t = ip6t_get_target(e);
 	struct xt_tgchk_param par = {
 		.net       = net,
 		.table     = name,
@@ -672,7 +670,7 @@ static int
 find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 		 unsigned int size)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
@@ -723,7 +721,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 
 static bool check_underflow(const struct ip6t_entry *e)
 {
-	const struct ip6t_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->ipv6))
@@ -731,7 +729,7 @@ static bool check_underflow(const struct ip6t_entry *e)
 	t = ip6t_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
-	verdict = ((struct ip6t_standard_target *)t)->verdict;
+	verdict = ((struct xt_standard_target *)t)->verdict;
 	verdict = -verdict - 1;
 	return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -754,7 +752,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	}
 
 	if (e->next_offset
-	    < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) {
+	    < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
 		duprintf("checking: element %p size %u\n",
 			 e, e->next_offset);
 		return -EINVAL;
@@ -786,7 +784,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 static void cleanup_entry(struct ip6t_entry *e, struct net *net)
 {
 	struct xt_tgdtor_param par;
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -899,7 +897,7 @@ get_counters(const struct xt_table_info *t,
 	struct ip6t_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu;
+	unsigned int curcpu = get_cpu();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
@@ -909,19 +907,22 @@ get_counters(const struct xt_table_info *t,
 	 * if new softirq were to run and call ipt_do_table
 	 */
 	local_bh_disable();
-	curcpu = smp_processor_id();
-
 	i = 0;
 	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
 		SET_COUNTER(counters[i], iter->counters.bcnt,
 			    iter->counters.pcnt);
 		++i;
 	}
+	local_bh_enable();
+	/* Processing counters from other cpus, we can let bottom half enabled,
+	 * (preemption is disabled)
+	 */
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		local_bh_disable();
 		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
 			ADD_COUNTER(counters[i], iter->counters.bcnt,
@@ -929,8 +930,9 @@ get_counters(const struct xt_table_info *t,
 			++i;
 		}
 		xt_info_wrunlock(cpu);
+		local_bh_enable();
 	}
-	local_bh_enable();
+	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -943,7 +945,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc_node(countersize, numa_node_id());
+	counters = vmalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -983,8 +985,8 @@ copy_entries_to_user(unsigned int total_size,
 	/* ... then go back and fix counters and names */
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
 		unsigned int i;
-		const struct ip6t_entry_match *m;
-		const struct ip6t_entry_target *t;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
 
 		e = (struct ip6t_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
@@ -1001,7 +1003,7 @@ copy_entries_to_user(unsigned int total_size,
 			m = (void *)e + i;
 
 			if (copy_to_user(userptr + off + i
-					 + offsetof(struct ip6t_entry_match,
+					 + offsetof(struct xt_entry_match,
 						    u.user.name),
 					 m->u.kernel.match->name,
 					 strlen(m->u.kernel.match->name)+1)
@@ -1013,7 +1015,7 @@ copy_entries_to_user(unsigned int total_size,
 
 		t = ip6t_get_target_c(e);
 		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct ip6t_entry_target,
+				 + offsetof(struct xt_entry_target,
 					    u.user.name),
 				 t->u.kernel.target->name,
 				 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1051,7 +1053,7 @@ static int compat_calc_entry(const struct ip6t_entry *e,
 			     const void *base, struct xt_table_info *newinfo)
 {
 	const struct xt_entry_match *ematch;
-	const struct ip6t_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
@@ -1103,7 +1105,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
 	int ret;
 
@@ -1116,7 +1118,7 @@ static int get_info(struct net *net, void __user *user,
 	if (copy_from_user(name, user, sizeof(name)) != 0)
 		return -EFAULT;
 
-	name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_lock(AF_INET6);
@@ -1213,8 +1215,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
-				numa_node_id());
+	counters = vmalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
@@ -1368,7 +1369,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	if (len != size + num_counters * sizeof(struct xt_counters))
 		return -EINVAL;
 
-	paddc = vmalloc_node(len - size, numa_node_id());
+	paddc = vmalloc(len - size);
 	if (!paddc)
 		return -ENOMEM;
 
@@ -1414,14 +1415,14 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 
 #ifdef CONFIG_COMPAT
 struct compat_ip6t_replace {
-	char			name[IP6T_TABLE_MAXNAMELEN];
+	char			name[XT_TABLE_MAXNAMELEN];
 	u32			valid_hooks;
 	u32			num_entries;
 	u32			size;
 	u32			hook_entry[NF_INET_NUMHOOKS];
 	u32			underflow[NF_INET_NUMHOOKS];
 	u32			num_counters;
-	compat_uptr_t		counters;	/* struct ip6t_counters * */
+	compat_uptr_t		counters;	/* struct xt_counters * */
 	struct compat_ip6t_entry entries[0];
 };
 
@@ -1430,7 +1431,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
 			  unsigned int *size, struct xt_counters *counters,
 			  unsigned int i)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct compat_ip6t_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
@@ -1465,7 +1466,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
 }
 
 static int
-compat_find_calc_match(struct ip6t_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
 		       const char *name,
 		       const struct ip6t_ip6 *ipv6,
 		       unsigned int hookmask,
@@ -1487,7 +1488,7 @@ compat_find_calc_match(struct ip6t_entry_match *m,
 
 static void compat_release_entry(struct compat_ip6t_entry *e)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -1508,7 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 				  const char *name)
 {
 	struct xt_entry_match *ematch;
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	unsigned int j;
@@ -1590,7 +1591,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 			    unsigned int *size, const char *name,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	struct ip6t_entry *de;
 	unsigned int origsize;
@@ -1765,6 +1766,9 @@ translate_compat_table(struct net *net,
 		if (ret != 0)
 			break;
 		++i;
+		if (strcmp(ip6t_get_target(iter1)->u.user.name,
+		    XT_ERROR_TARGET) == 0)
+			++newinfo->stacksize;
 	}
 	if (ret) {
 		/*
@@ -1895,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
 }
 
 struct compat_ip6t_get_entries {
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t size;
 	struct compat_ip6t_entry entrytable[0];
 };
@@ -2050,7 +2054,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
 	case IP6T_SO_GET_REVISION_MATCH:
 	case IP6T_SO_GET_REVISION_TARGET: {
-		struct ip6t_get_revision rev;
+		struct xt_get_revision rev;
 		int target;
 
 		if (*len != sizeof(rev)) {
@@ -2187,7 +2191,7 @@ static int icmp6_checkentry(const struct xt_mtchk_param *par)
 /* The built-in targets: standard (NULL) and error. */
 static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 	{
-		.name             = IP6T_STANDARD_TARGET,
+		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_IPV6,
 #ifdef CONFIG_COMPAT
@@ -2197,9 +2201,9 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 #endif
 	},
 	{
-		.name             = IP6T_ERROR_TARGET,
+		.name             = XT_ERROR_TARGET,
 		.target           = ip6t_error,
-		.targetsize       = IP6T_FUNCTION_MAXNAMELEN,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_IPV6,
 	},
 };
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index af4ee11f206..09c88891a75 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -23,6 +23,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
 
 MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
 MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
@@ -32,11 +33,9 @@ struct in_device;
 #include <net/route.h>
 #include <linux/netfilter_ipv6/ip6t_LOG.h>
 
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
 /* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+			const struct nf_loginfo *info,
 			const struct sk_buff *skb, unsigned int ip6hoff,
 			int recurse)
 {
@@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
 	if (ih == NULL) {
-		printk("TRUNCATED");
+		sb_add(m, "TRUNCATED");
 		return;
 	}
 
 	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-	printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+	sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-	printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+	sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
 	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
 	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
 	       ih->hop_limit,
@@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
 		if (hp == NULL) {
-			printk("TRUNCATED");
+			sb_add(m, "TRUNCATED");
 			return;
 		}
 
 		/* Max length: 48 "OPT (...) " */
 		if (logflags & IP6T_LOG_IPOPT)
-			printk("OPT ( ");
+			sb_add(m, "OPT ( ");
 
 		switch (currenthdr) {
 		case IPPROTO_FRAGMENT: {
 			struct frag_hdr _fhdr;
 			const struct frag_hdr *fh;
 
-			printk("FRAG:");
+			sb_add(m, "FRAG:");
 			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
 						&_fhdr);
 			if (fh == NULL) {
-				printk("TRUNCATED ");
+				sb_add(m, "TRUNCATED ");
 				return;
 			}
 
 			/* Max length: 6 "65535 " */
-			printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
+			sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
 
 			/* Max length: 11 "INCOMPLETE " */
 			if (fh->frag_off & htons(0x0001))
-				printk("INCOMPLETE ");
+				sb_add(m, "INCOMPLETE ");
 
-			printk("ID:%08x ", ntohl(fh->identification));
+			sb_add(m, "ID:%08x ", ntohl(fh->identification));
 
 			if (ntohs(fh->frag_off) & 0xFFF8)
 				fragment = 1;
@@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		case IPPROTO_HOPOPTS:
 			if (fragment) {
 				if (logflags & IP6T_LOG_IPOPT)
-					printk(")");
+					sb_add(m, ")");
 				return;
 			}
 			hdrlen = ipv6_optlen(hp);
@@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info,
 				const struct ip_auth_hdr *ah;
 
 				/* Max length: 3 "AH " */
-				printk("AH ");
+				sb_add(m, "AH ");
 
 				if (fragment) {
-					printk(")");
+					sb_add(m, ")");
 					return;
 				}
 
@@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info,
 					 * Max length: 26 "INCOMPLETE [65535
 					 *  bytes] )"
 					 */
-					printk("INCOMPLETE [%u bytes] )",
+					sb_add(m, "INCOMPLETE [%u bytes] )",
 					       skb->len - ptr);
 					return;
 				}
 
 				/* Length: 15 "SPI=0xF1234567 */
-				printk("SPI=0x%x ", ntohl(ah->spi));
+				sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
 
 			}
 
@@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info,
 				const struct ip_esp_hdr *eh;
 
 				/* Max length: 4 "ESP " */
-				printk("ESP ");
+				sb_add(m, "ESP ");
 
 				if (fragment) {
-					printk(")");
+					sb_add(m, ")");
 					return;
 				}
 
@@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info,
 				eh = skb_header_pointer(skb, ptr, sizeof(_esph),
 							&_esph);
 				if (eh == NULL) {
-					printk("INCOMPLETE [%u bytes] )",
+					sb_add(m, "INCOMPLETE [%u bytes] )",
 					       skb->len - ptr);
 					return;
 				}
 
 				/* Length: 16 "SPI=0xF1234567 )" */
-				printk("SPI=0x%x )", ntohl(eh->spi) );
+				sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
 
 			}
 			return;
 		default:
 			/* Max length: 20 "Unknown Ext Hdr 255" */
-			printk("Unknown Ext Hdr %u", currenthdr);
+			sb_add(m, "Unknown Ext Hdr %u", currenthdr);
 			return;
 		}
 		if (logflags & IP6T_LOG_IPOPT)
-			printk(") ");
+			sb_add(m, ") ");
 
 		currenthdr = hp->nexthdr;
 		ptr += hdrlen;
@@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct tcphdr *th;
 
 		/* Max length: 10 "PROTO=TCP " */
-		printk("PROTO=TCP ");
+		sb_add(m, "PROTO=TCP ");
 
 		if (fragment)
 			break;
@@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
 		if (th == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u ",
+		sb_add(m, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
 		if (logflags & IP6T_LOG_TCPSEQ)
-			printk("SEQ=%u ACK=%u ",
+			sb_add(m, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
-		printk("WINDOW=%u ", ntohs(th->window));
+		sb_add(m, "WINDOW=%u ", ntohs(th->window));
 		/* Max length: 9 "RES=0x3C " */
-		printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
 		if (th->cwr)
-			printk("CWR ");
+			sb_add(m, "CWR ");
 		if (th->ece)
-			printk("ECE ");
+			sb_add(m, "ECE ");
 		if (th->urg)
-			printk("URG ");
+			sb_add(m, "URG ");
 		if (th->ack)
-			printk("ACK ");
+			sb_add(m, "ACK ");
 		if (th->psh)
-			printk("PSH ");
+			sb_add(m, "PSH ");
 		if (th->rst)
-			printk("RST ");
+			sb_add(m, "RST ");
 		if (th->syn)
-			printk("SYN ");
+			sb_add(m, "SYN ");
 		if (th->fin)
-			printk("FIN ");
+			sb_add(m, "FIN ");
 		/* Max length: 11 "URGP=65535 " */
-		printk("URGP=%u ", ntohs(th->urg_ptr));
+		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
 		if ((logflags & IP6T_LOG_TCPOPT) &&
 		    th->doff * 4 > sizeof(struct tcphdr)) {
@@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info,
 						ptr + sizeof(struct tcphdr),
 						optsize, _opt);
 			if (op == NULL) {
-				printk("OPT (TRUNCATED)");
+				sb_add(m, "OPT (TRUNCATED)");
 				return;
 			}
 
 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
-			printk("OPT (");
+			sb_add(m, "OPT (");
 			for (i =0; i < optsize; i++)
-				printk("%02X", op[i]);
-			printk(") ");
+				sb_add(m, "%02X", op[i]);
+			sb_add(m, ") ");
 		}
 		break;
 	}
@@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		if (currenthdr == IPPROTO_UDP)
 			/* Max length: 10 "PROTO=UDP "     */
-			printk("PROTO=UDP " );
+			sb_add(m, "PROTO=UDP " );
 		else	/* Max length: 14 "PROTO=UDPLITE " */
-			printk("PROTO=UDPLITE ");
+			sb_add(m, "PROTO=UDPLITE ");
 
 		if (fragment)
 			break;
@@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
 		if (uh == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u LEN=%u ",
+		sb_add(m, "SPT=%u DPT=%u LEN=%u ",
 		       ntohs(uh->source), ntohs(uh->dest),
 		       ntohs(uh->len));
 		break;
@@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct icmp6hdr *ic;
 
 		/* Max length: 13 "PROTO=ICMPv6 " */
-		printk("PROTO=ICMPv6 ");
+		sb_add(m, "PROTO=ICMPv6 ");
 
 		if (fragment)
 			break;
@@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
 		if (ic == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 18 "TYPE=255 CODE=255 " */
-		printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+		sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
 
 		switch (ic->icmp6_type) {
 		case ICMPV6_ECHO_REQUEST:
 		case ICMPV6_ECHO_REPLY:
 			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			printk("ID=%u SEQ=%u ",
+			sb_add(m, "ID=%u SEQ=%u ",
 				ntohs(ic->icmp6_identifier),
 				ntohs(ic->icmp6_sequence));
 			break;
@@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		case ICMPV6_PARAMPROB:
 			/* Max length: 17 "POINTER=ffffffff " */
-			printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
+			sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
 			/* Fall through */
 		case ICMPV6_DEST_UNREACH:
 		case ICMPV6_PKT_TOOBIG:
 		case ICMPV6_TIME_EXCEED:
 			/* Max length: 3+maxlen */
 			if (recurse) {
-				printk("[");
-				dump_packet(info, skb, ptr + sizeof(_icmp6h),
-					    0);
-				printk("] ");
+				sb_add(m, "[");
+				dump_packet(m, info, skb,
+					    ptr + sizeof(_icmp6h), 0);
+				sb_add(m, "] ");
 			}
 
 			/* Max length: 10 "MTU=65535 " */
 			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
-				printk("MTU=%u ", ntohl(ic->icmp6_mtu));
+				sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
 		}
 		break;
 	}
 	/* Max length: 10 "PROTO=255 " */
 	default:
-		printk("PROTO=%u ", currenthdr);
+		sb_add(m, "PROTO=%u ", currenthdr);
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
 	if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			printk("UID=%u GID=%u ",
+			sb_add(m, "UID=%u GID=%u ",
 				skb->sk->sk_socket->file->f_cred->fsuid,
 				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -370,7 +369,58 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
 	if (!recurse && skb->mark)
-		printk("MARK=0x%x ", skb->mark);
+		sb_add(m, "MARK=0x%x ", skb->mark);
+}
+
+static void dump_mac_header(struct sbuff *m,
+			    const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & IP6T_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	sb_add(m, "MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int len = dev->hard_header_len;
+		unsigned int i;
+
+		if (dev->type == ARPHRD_SIT &&
+		    (p -= ETH_HLEN) < skb->head)
+			p = NULL;
+
+		if (p != NULL) {
+			sb_add(m, "%02x", *p++);
+			for (i = 1; i < len; i++)
+				sb_add(m, ":%02x", p[i]);
+		}
+		sb_add(m, " ");
+
+		if (dev->type == ARPHRD_SIT) {
+			const struct iphdr *iph =
+				(struct iphdr *)skb_mac_header(skb);
+			sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+		}
+	} else
+		sb_add(m, " ");
 }
 
 static struct nf_loginfo default_loginfo = {
@@ -392,47 +442,23 @@ ip6t_log_packet(u_int8_t pf,
 		const struct nf_loginfo *loginfo,
 		const char *prefix)
 {
+	struct sbuff *m = sb_open();
+
 	if (!loginfo)
 		loginfo = &default_loginfo;
 
-	spin_lock_bh(&log_lock);
-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
-		prefix,
-		in ? in->name : "",
-		out ? out->name : "");
-	if (in && !out) {
-		unsigned int len;
-		/* MAC logging for input chain only. */
-		printk("MAC=");
-		if (skb->dev && (len = skb->dev->hard_header_len) &&
-		    skb->mac_header != skb->network_header) {
-			const unsigned char *p = skb_mac_header(skb);
-			int i;
-
-			if (skb->dev->type == ARPHRD_SIT &&
-			    (p -= ETH_HLEN) < skb->head)
-				p = NULL;
-
-			if (p != NULL) {
-				for (i = 0; i < len; i++)
-					printk("%02x%s", p[i],
-					       i == len - 1 ? "" : ":");
-			}
-			printk(" ");
+	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	       prefix,
+	       in ? in->name : "",
+	       out ? out->name : "");
 
-			if (skb->dev->type == ARPHRD_SIT) {
-				const struct iphdr *iph =
-					(struct iphdr *)skb_mac_header(skb);
-				printk("TUNNEL=%pI4->%pI4 ",
-				       &iph->saddr, &iph->daddr);
-			}
-		} else
-			printk(" ");
-	}
+	/* MAC logging for input path only. */
+	if (in && !out)
+		dump_mac_header(m, loginfo, skb);
+
+	dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
 
-	dump_packet(loginfo, skb, skb_network_offset(skb), 1);
-	printk("\n");
-	spin_unlock_bh(&log_lock);
+	sb_close(m);
 }
 
 static unsigned int
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index ff43461704b..c8af58b2256 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/icmp.h>
-#include <linux/sysctl.h>
 #include <net/ipv6.h>
 #include <net/inet_frag.h>
 
@@ -29,6 +28,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <net/netfilter/nf_log.h>
 
 static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
@@ -189,53 +189,6 @@ out:
 	return nf_conntrack_confirm(skb);
 }
 
-static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
-						struct sk_buff *skb)
-{
-	u16 zone = NF_CT_DEFAULT_ZONE;
-
-	if (skb->nfct)
-		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
-
-#ifdef CONFIG_BRIDGE_NETFILTER
-	if (skb->nf_bridge &&
-	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
-		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
-	if (hooknum == NF_INET_PRE_ROUTING)
-		return IP6_DEFRAG_CONNTRACK_IN + zone;
-	else
-		return IP6_DEFRAG_CONNTRACK_OUT + zone;
-
-}
-
-static unsigned int ipv6_defrag(unsigned int hooknum,
-				struct sk_buff *skb,
-				const struct net_device *in,
-				const struct net_device *out,
-				int (*okfn)(struct sk_buff *))
-{
-	struct sk_buff *reasm;
-
-	/* Previously seen (loopback)?  */
-	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
-		return NF_ACCEPT;
-
-	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
-	/* queued */
-	if (reasm == NULL)
-		return NF_STOLEN;
-
-	/* error occured or not fragmented */
-	if (reasm == skb)
-		return NF_ACCEPT;
-
-	nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
-			   (struct net_device *)out, okfn);
-
-	return NF_STOLEN;
-}
-
 static unsigned int __ipv6_conntrack_in(struct net *net,
 					unsigned int hooknum,
 					struct sk_buff *skb,
@@ -288,13 +241,6 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
 
 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
 	{
-		.hook		= ipv6_defrag,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
-	},
-	{
 		.hook		= ipv6_conntrack_in,
 		.owner		= THIS_MODULE,
 		.pf		= NFPROTO_IPV6,
@@ -309,13 +255,6 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
 		.priority	= NF_IP6_PRI_CONNTRACK,
 	},
 	{
-		.hook		= ipv6_defrag,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
-	},
-	{
 		.hook		= ipv6_confirm,
 		.owner		= THIS_MODULE,
 		.pf		= NFPROTO_IPV6,
@@ -387,10 +326,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
 	.nlattr_to_tuple	= ipv6_nlattr_to_tuple,
 	.nla_policy		= ipv6_nla_policy,
 #endif
-#ifdef CONFIG_SYSCTL
-	.ctl_table_path		= nf_net_netfilter_sysctl_path,
-	.ctl_table		= nf_ct_ipv6_sysctl_table,
-#endif
 	.me			= THIS_MODULE,
 };
 
@@ -403,16 +338,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 	int ret = 0;
 
 	need_conntrack();
+	nf_defrag_ipv6_enable();
 
-	ret = nf_ct_frag6_init();
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv6: can't initialize frag6.\n");
-		return ret;
-	}
 	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6);
 	if (ret < 0) {
 		pr_err("nf_conntrack_ipv6: can't register tcp.\n");
-		goto cleanup_frag6;
+		return ret;
 	}
 
 	ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6);
@@ -450,8 +381,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
  cleanup_tcp:
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
- cleanup_frag6:
-	nf_ct_frag6_cleanup();
 	return ret;
 }
 
@@ -463,7 +392,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void)
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6);
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6);
 	nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6);
-	nf_ct_frag6_cleanup();
 }
 
 module_init(nf_conntrack_l3proto_ipv6_init);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 9be81776415..1df3c8b6bf4 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -208,7 +208,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	type = icmp6h->icmp6_type - 130;
 	if (type >= 0 && type < sizeof(noct_valid_new) &&
 	    noct_valid_new[type]) {
-		skb->nfct = &nf_conntrack_untracked.ct_general;
+		skb->nfct = &nf_ct_untracked_get()->ct_general;
 		skb->nfctinfo = IP_CT_NEW;
 		nf_conntrack_get(skb->nfct);
 		return NF_ACCEPT;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6fb890187de..3a3f129a44c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -73,7 +73,7 @@ static struct inet_frags nf_frags;
 static struct netns_frags nf_init_frags;
 
 #ifdef CONFIG_SYSCTL
-struct ctl_table nf_ct_ipv6_sysctl_table[] = {
+struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_frag6_timeout",
 		.data		= &nf_init_frags.timeout,
@@ -97,6 +97,8 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = {
 	},
 	{ }
 };
+
+static struct ctl_table_header *nf_ct_frag6_sysctl_header;
 #endif
 
 static unsigned int nf_hashfn(struct inet_frag_queue *q)
@@ -113,16 +115,6 @@ static void nf_skb_free(struct sk_buff *skb)
 		kfree_skb(NFCT_FRAG6_CB(skb)->orig);
 }
 
-/* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work)
-{
-	if (work)
-		*work -= skb->truesize;
-	atomic_sub(skb->truesize, &nf_init_frags.mem);
-	nf_skb_free(skb);
-	kfree_skb(skb);
-}
-
 /* Destruction primitives. */
 
 static __inline__ void fq_put(struct nf_ct_frag6_queue *fq)
@@ -201,7 +193,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 	int offset, end;
 
 	if (fq->q.last_in & INET_FRAG_COMPLETE) {
-		pr_debug("Allready completed\n");
+		pr_debug("Already completed\n");
 		goto err;
 	}
 
@@ -271,6 +263,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 	 * in the chain of fragments so far.  We must know where to put
 	 * this fragment, right?
 	 */
+	prev = fq->q.fragments_tail;
+	if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
+		next = NULL;
+		goto found;
+	}
 	prev = NULL;
 	for (next = fq->q.fragments; next != NULL; next = next->next) {
 		if (NFCT_FRAG6_CB(next)->offset >= offset)
@@ -278,71 +275,30 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		prev = next;
 	}
 
-	/* We found where to put this one.  Check for overlap with
-	 * preceding fragment, and, if needed, align things so that
-	 * any overlaps are eliminated.
+found:
+	/* RFC5722, Section 4:
+	 *                                  When reassembling an IPv6 datagram, if
+	 *   one or more its constituent fragments is determined to be an
+	 *   overlapping fragment, the entire datagram (and any constituent
+	 *   fragments, including those not yet received) MUST be silently
+	 *   discarded.
 	 */
-	if (prev) {
-		int i = (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset;
-
-		if (i > 0) {
-			offset += i;
-			if (end <= offset) {
-				pr_debug("overlap\n");
-				goto err;
-			}
-			if (!pskb_pull(skb, i)) {
-				pr_debug("Can't pull\n");
-				goto err;
-			}
-			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
-	}
-
-	/* Look for overlap with succeeding segments.
-	 * If we can merge fragments, do it.
-	 */
-	while (next && NFCT_FRAG6_CB(next)->offset < end) {
-		/* overlap is 'i' bytes */
-		int i = end - NFCT_FRAG6_CB(next)->offset;
-
-		if (i < next->len) {
-			/* Eat head of the next overlapped fragment
-			 * and leave the loop. The next ones cannot overlap.
-			 */
-			pr_debug("Eat head of the overlapped parts.: %d", i);
-			if (!pskb_pull(next, i))
-				goto err;
-
-			/* next fragment */
-			NFCT_FRAG6_CB(next)->offset += i;
-			fq->q.meat -= i;
-			if (next->ip_summed != CHECKSUM_UNNECESSARY)
-				next->ip_summed = CHECKSUM_NONE;
-			break;
-		} else {
-			struct sk_buff *free_it = next;
-
-			/* Old fragmnet is completely overridden with
-			 * new one drop it.
-			 */
-			next = next->next;
 
-			if (prev)
-				prev->next = next;
-			else
-				fq->q.fragments = next;
+	/* Check for overlap with preceding fragment. */
+	if (prev &&
+	    (NFCT_FRAG6_CB(prev)->offset + prev->len) - offset > 0)
+		goto discard_fq;
 
-			fq->q.meat -= free_it->len;
-			frag_kfree_skb(free_it, NULL);
-		}
-	}
+	/* Look for overlap with succeeding segment. */
+	if (next && NFCT_FRAG6_CB(next)->offset < end)
+		goto discard_fq;
 
 	NFCT_FRAG6_CB(skb)->offset = offset;
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
+	if (!next)
+		fq->q.fragments_tail = skb;
 	if (prev)
 		prev->next = skb;
 	else
@@ -365,6 +321,8 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 	write_unlock(&nf_frags.lock);
 	return 0;
 
+discard_fq:
+	fq_kill(fq);
 err:
 	return -1;
 }
@@ -407,7 +365,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_has_frags(head)) {
+	if (skb_has_frag_list(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
@@ -442,7 +400,6 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 	skb_shinfo(head)->frag_list = head->next;
 	skb_reset_transport_header(head);
 	skb_push(head, head->data - skb_network_header(head));
-	atomic_sub(head->truesize, &nf_init_frags.mem);
 
 	for (fp=head->next; fp; fp = fp->next) {
 		head->data_len += fp->len;
@@ -452,8 +409,8 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
-		atomic_sub(fp->truesize, &nf_init_frags.mem);
 	}
+	atomic_sub(head->truesize, &nf_init_frags.mem);
 
 	head->next = NULL;
 	head->dev = dev;
@@ -467,6 +424,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 					  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.fragments_tail = NULL;
 
 	/* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
 	fp = skb_shinfo(head)->frag_list;
@@ -667,11 +625,24 @@ int nf_ct_frag6_init(void)
 	inet_frags_init_net(&nf_init_frags);
 	inet_frags_init(&nf_frags);
 
+#ifdef CONFIG_SYSCTL
+	nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
+							  nf_ct_frag6_sysctl_table);
+	if (!nf_ct_frag6_sysctl_header) {
+		inet_frags_fini(&nf_frags);
+		return -ENOMEM;
+	}
+#endif
+
 	return 0;
 }
 
 void nf_ct_frag6_cleanup(void)
 {
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(nf_ct_frag6_sysctl_header);
+	nf_ct_frag6_sysctl_header = NULL;
+#endif
 	inet_frags_fini(&nf_frags);
 
 	nf_init_frags.low_thresh = 0;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 00000000000..99abfb53bab
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,131 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+						struct sk_buff *skb)
+{
+	u16 zone = NF_CT_DEFAULT_ZONE;
+
+	if (skb->nfct)
+		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge &&
+	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+	if (hooknum == NF_INET_PRE_ROUTING)
+		return IP6_DEFRAG_CONNTRACK_IN + zone;
+	else
+		return IP6_DEFRAG_CONNTRACK_OUT + zone;
+
+}
+
+static unsigned int ipv6_defrag(unsigned int hooknum,
+				struct sk_buff *skb,
+				const struct net_device *in,
+				const struct net_device *out,
+				int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *reasm;
+
+	/* Previously seen (loopback)?	*/
+	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+		return NF_ACCEPT;
+
+	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+	/* queued */
+	if (reasm == NULL)
+		return NF_STOLEN;
+
+	/* error occured or not fragmented */
+	if (reasm == skb)
+		return NF_ACCEPT;
+
+	nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+			   (struct net_device *)out, okfn);
+
+	return NF_STOLEN;
+}
+
+static struct nf_hook_ops ipv6_defrag_ops[] = {
+	{
+		.hook		= ipv6_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
+	},
+	{
+		.hook		= ipv6_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
+	},
+};
+
+static int __init nf_defrag_init(void)
+{
+	int ret = 0;
+
+	ret = nf_ct_frag6_init();
+	if (ret < 0) {
+		pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+		return ret;
+	}
+	ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+	if (ret < 0) {
+		pr_err("nf_defrag_ipv6: can't register hooks\n");
+		goto cleanup_frag6;
+	}
+	return ret;
+
+cleanup_frag6:
+	nf_ct_frag6_cleanup();
+	return ret;
+
+}
+
+static void __exit nf_defrag_fini(void)
+{
+	nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+	nf_ct_frag6_cleanup();
+}
+
+void nf_defrag_ipv6_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 566798d69f3..24b3558b8e6 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -126,6 +126,8 @@ static const struct snmp_mib snmp6_udp6_list[] = {
 	SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
 	SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS),
 	SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
 	SNMP_MIB_SENTINEL
 };
 
@@ -134,6 +136,8 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
 	SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS),
 	SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS),
 	SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
 	SNMP_MIB_SENTINEL
 };
 
@@ -174,17 +178,28 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **mib,
 				const struct snmp_mib *itemlist)
 {
 	int i;
-	for (i=0; itemlist[i].name; i++)
+
+	for (i = 0; itemlist[i].name; i++)
 		seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
 			   snmp_fold_field(mib, itemlist[i].entry));
 }
 
+static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib,
+				  const struct snmp_mib *itemlist, size_t syncpoff)
+{
+	int i;
+
+	for (i = 0; itemlist[i].name; i++)
+		seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name,
+			   snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
+}
+
 static int snmp6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = (struct net *)seq->private;
 
-	snmp6_seq_show_item(seq, (void __percpu **)net->mib.ipv6_statistics,
-			    snmp6_ipstats_list);
+	snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics,
+			    snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
 	snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics,
 			    snmp6_icmp6_list);
 	snmp6_seq_show_icmpv6msg(seq,
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 1fa3468f0f3..9a7978fdc02 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -25,28 +25,15 @@
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 
-const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
-static DEFINE_SPINLOCK(inet6_proto_lock);
-
+const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
-
-	spin_lock_bh(&inet6_proto_lock);
-
-	if (inet6_protos[hash]) {
-		ret = -1;
-	} else {
-		inet6_protos[hash] = prot;
-		ret = 0;
-	}
-
-	spin_unlock_bh(&inet6_proto_lock);
+	int hash = protocol & (MAX_INET_PROTOS - 1);
 
-	return ret;
+	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+			NULL, prot) ? 0 : -1;
 }
-
 EXPORT_SYMBOL(inet6_add_protocol);
 
 /*
@@ -57,20 +44,11 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol
 {
 	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet6_proto_lock);
-
-	if (inet6_protos[hash] != prot) {
-		ret = -1;
-	} else {
-		inet6_protos[hash] = NULL;
-		ret = 0;
-	}
-
-	spin_unlock_bh(&inet6_proto_lock);
+	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+		       prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
 
 	return ret;
 }
-
 EXPORT_SYMBOL(inet6_del_protocol);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4a4dcbe4f8b..86c39526ba5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -373,7 +373,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 
 static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
 {
-	if ((raw6_sk(sk)->checksum || sk->sk_filter) &&
+	if ((raw6_sk(sk)->checksum || rcu_dereference_raw(sk->sk_filter)) &&
 	    skb_checksum_complete(skb)) {
 		atomic_inc(&sk->sk_drops);
 		kfree_skb(skb);
@@ -602,31 +602,33 @@ out:
 }
 
 static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
-			struct flowi *fl, struct rt6_info *rt,
+			struct flowi *fl, struct dst_entry **dstp,
 			unsigned int flags)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
 	int err;
+	struct rt6_info *rt = (struct rt6_info *)*dstp;
 
-	if (length > rt->u.dst.dev->mtu) {
-		ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu);
+	if (length > rt->dst.dev->mtu) {
+		ipv6_local_error(sk, EMSGSIZE, fl, rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
 	if (flags&MSG_PROBE)
 		goto out;
 
 	skb = sock_alloc_send_skb(sk,
-				  length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15,
+				  length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
 				  flags & MSG_DONTWAIT, &err);
 	if (skb == NULL)
 		goto error;
-	skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev));
+	skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
 
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	skb_dst_set(skb, dst_clone(&rt->u.dst));
+	skb_dst_set(skb, &rt->dst);
+	*dstp = NULL;
 
 	skb_put(skb, length);
 	skb_reset_network_header(skb);
@@ -641,7 +643,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 
 	IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
 	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
-		      rt->u.dst.dev, dst_output);
+		      rt->dst.dev, dst_output);
 	if (err > 0)
 		err = net_xmit_errno(err);
 	if (err)
@@ -725,7 +727,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 {
 	struct ipv6_txoptions opt_space;
 	struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
-	struct in6_addr *daddr, *final_p = NULL, final;
+	struct in6_addr *daddr, *final_p, final;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct raw6_sock *rp = raw6_sk(sk);
@@ -762,7 +764,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 			return -EINVAL;
 
 		if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
-			return(-EAFNOSUPPORT);
+			return -EAFNOSUPPORT;
 
 		/* port is the proto value [0..255] carried in nexthdr */
 		proto = ntohs(sin6->sin6_port);
@@ -770,10 +772,10 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		if (!proto)
 			proto = inet->inet_num;
 		else if (proto != inet->inet_num)
-			return(-EINVAL);
+			return -EINVAL;
 
 		if (proto > 255)
-			return(-EINVAL);
+			return -EINVAL;
 
 		daddr = &sin6->sin6_addr;
 		if (np->sndflow) {
@@ -847,13 +849,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr))
 		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 
-	/* merge ip6_build_xmit from ip6_output */
-	if (opt && opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
 		fl.oif = np->mcast_oif;
@@ -892,9 +888,9 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		goto do_confirm;
 
 back_from_confirm:
-	if (inet->hdrincl) {
-		err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags);
-	} else {
+	if (inet->hdrincl)
+		err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, &dst, msg->msg_flags);
+	else {
 		lock_sock(sk);
 		err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 			len, 0, hlimit, tclass, opt, &fl, (struct rt6_info*)dst,
@@ -989,7 +985,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			/* You may get strange result with a positive odd offset;
 			   RFC2292bis agrees with me. */
 			if (val > 0 && (val&1))
-				return(-EINVAL);
+				return -EINVAL;
 			if (val < 0) {
 				rp->checksum = 0;
 			} else {
@@ -1001,7 +997,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			break;
 
 		default:
-			return(-ENOPROTOOPT);
+			return -ENOPROTOOPT;
 	}
 }
 
@@ -1194,7 +1190,7 @@ static int rawv6_init_sk(struct sock *sk)
 	default:
 		break;
 	}
-	return(0);
+	return 0;
 }
 
 struct proto rawv6_prot = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 6d4292ff585..c7ba3149633 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -149,16 +149,6 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a)
 }
 EXPORT_SYMBOL(ip6_frag_match);
 
-/* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct netns_frags *nf,
-		struct sk_buff *skb, int *work)
-{
-	if (work)
-		*work -= skb->truesize;
-	atomic_sub(skb->truesize, &nf->mem);
-	kfree_skb(skb);
-}
-
 void ip6_frag_init(struct inet_frag_queue *q, void *a)
 {
 	struct frag_queue *fq = container_of(q, struct frag_queue, q);
@@ -336,6 +326,11 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	 * in the chain of fragments so far.  We must know where to put
 	 * this fragment, right?
 	 */
+	prev = fq->q.fragments_tail;
+	if (!prev || FRAG6_CB(prev)->offset < offset) {
+		next = NULL;
+		goto found;
+	}
 	prev = NULL;
 	for(next = fq->q.fragments; next != NULL; next = next->next) {
 		if (FRAG6_CB(next)->offset >= offset)
@@ -343,63 +338,30 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 		prev = next;
 	}
 
-	/* We found where to put this one.  Check for overlap with
-	 * preceding fragment, and, if needed, align things so that
-	 * any overlaps are eliminated.
-	 */
-	if (prev) {
-		int i = (FRAG6_CB(prev)->offset + prev->len) - offset;
-
-		if (i > 0) {
-			offset += i;
-			if (end <= offset)
-				goto err;
-			if (!pskb_pull(skb, i))
-				goto err;
-			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
-	}
-
-	/* Look for overlap with succeeding segments.
-	 * If we can merge fragments, do it.
+found:
+	/* RFC5722, Section 4:
+	 *                                  When reassembling an IPv6 datagram, if
+	 *   one or more its constituent fragments is determined to be an
+	 *   overlapping fragment, the entire datagram (and any constituent
+	 *   fragments, including those not yet received) MUST be silently
+	 *   discarded.
 	 */
-	while (next && FRAG6_CB(next)->offset < end) {
-		int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */
 
-		if (i < next->len) {
-			/* Eat head of the next overlapped fragment
-			 * and leave the loop. The next ones cannot overlap.
-			 */
-			if (!pskb_pull(next, i))
-				goto err;
-			FRAG6_CB(next)->offset += i;	/* next fragment */
-			fq->q.meat -= i;
-			if (next->ip_summed != CHECKSUM_UNNECESSARY)
-				next->ip_summed = CHECKSUM_NONE;
-			break;
-		} else {
-			struct sk_buff *free_it = next;
-
-			/* Old fragment is completely overridden with
-			 * new one drop it.
-			 */
-			next = next->next;
+	/* Check for overlap with preceding fragment. */
+	if (prev &&
+	    (FRAG6_CB(prev)->offset + prev->len) - offset > 0)
+		goto discard_fq;
 
-			if (prev)
-				prev->next = next;
-			else
-				fq->q.fragments = next;
-
-			fq->q.meat -= free_it->len;
-			frag_kfree_skb(fq->q.net, free_it, NULL);
-		}
-	}
+	/* Look for overlap with succeeding segment. */
+	if (next && FRAG6_CB(next)->offset < end)
+		goto discard_fq;
 
 	FRAG6_CB(skb)->offset = offset;
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
+	if (!next)
+		fq->q.fragments_tail = skb;
 	if (prev)
 		prev->next = skb;
 	else
@@ -431,6 +393,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	write_unlock(&ip6_frags.lock);
 	return -1;
 
+discard_fq:
+	fq_kill(fq);
 err:
 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 		      IPSTATS_MIB_REASMFAILS);
@@ -466,6 +430,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 			goto out_oom;
 
 		fp->next = head->next;
+		if (!fp->next)
+			fq->q.fragments_tail = fp;
 		prev->next = fp;
 
 		skb_morph(head, fq->q.fragments);
@@ -492,7 +458,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_has_frags(head)) {
+	if (skb_has_frag_list(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
@@ -524,7 +490,6 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	skb_shinfo(head)->frag_list = head->next;
 	skb_reset_transport_header(head);
 	skb_push(head, head->data - skb_network_header(head));
-	atomic_sub(head->truesize, &fq->q.net->mem);
 
 	for (fp=head->next; fp; fp = fp->next) {
 		head->data_len += fp->len;
@@ -534,8 +499,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
-		atomic_sub(fp->truesize, &fq->q.net->mem);
 	}
+	atomic_sub(head->truesize, &fq->q.net->mem);
 
 	head->next = NULL;
 	head->dev = dev;
@@ -553,6 +518,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
 	rcu_read_unlock();
 	fq->q.fragments = NULL;
+	fq->q.fragments_tail = NULL;
 	return 1;
 
 out_oversize:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 252d76199c4..25661f968f3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,6 @@ static struct dst_ops ip6_dst_ops_template = {
 	.link_failure		=	ip6_link_failure,
 	.update_pmtu		=	ip6_rt_update_pmtu,
 	.local_out		=	__ip6_local_out,
-	.entries		=	ATOMIC_INIT(0),
 };
 
 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -122,20 +121,17 @@ static struct dst_ops ip6_dst_blackhole_ops = {
 	.destroy		=	ip6_dst_destroy,
 	.check			=	ip6_dst_check,
 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
-	.entries		=	ATOMIC_INIT(0),
 };
 
 static struct rt6_info ip6_null_entry_template = {
-	.u = {
-		.dst = {
-			.__refcnt	= ATOMIC_INIT(1),
-			.__use		= 1,
-			.obsolete	= -1,
-			.error		= -ENETUNREACH,
-			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
-			.input		= ip6_pkt_discard,
-			.output		= ip6_pkt_discard_out,
-		}
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= -1,
+		.error		= -ENETUNREACH,
+		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
+		.input		= ip6_pkt_discard,
+		.output		= ip6_pkt_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 	.rt6i_protocol  = RTPROT_KERNEL,
@@ -149,16 +145,14 @@ static int ip6_pkt_prohibit(struct sk_buff *skb);
 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 
 static struct rt6_info ip6_prohibit_entry_template = {
-	.u = {
-		.dst = {
-			.__refcnt	= ATOMIC_INIT(1),
-			.__use		= 1,
-			.obsolete	= -1,
-			.error		= -EACCES,
-			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
-			.input		= ip6_pkt_prohibit,
-			.output		= ip6_pkt_prohibit_out,
-		}
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= -1,
+		.error		= -EACCES,
+		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
+		.input		= ip6_pkt_prohibit,
+		.output		= ip6_pkt_prohibit_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 	.rt6i_protocol  = RTPROT_KERNEL,
@@ -167,16 +161,14 @@ static struct rt6_info ip6_prohibit_entry_template = {
 };
 
 static struct rt6_info ip6_blk_hole_entry_template = {
-	.u = {
-		.dst = {
-			.__refcnt	= ATOMIC_INIT(1),
-			.__use		= 1,
-			.obsolete	= -1,
-			.error		= -EINVAL,
-			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
-			.input		= dst_discard,
-			.output		= dst_discard,
-		}
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= -1,
+		.error		= -EINVAL,
+		.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
+		.input		= dst_discard,
+		.output		= dst_discard,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 	.rt6i_protocol  = RTPROT_KERNEL,
@@ -223,14 +215,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 
 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 {
-	return (rt->rt6i_flags & RTF_EXPIRES &&
-		time_after(jiffies, rt->rt6i_expires));
+	return (rt->rt6i_flags & RTF_EXPIRES) &&
+		time_after(jiffies, rt->rt6i_expires);
 }
 
 static inline int rt6_need_strict(struct in6_addr *daddr)
 {
-	return (ipv6_addr_type(daddr) &
-		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
+	return ipv6_addr_type(daddr) &
+		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
 /*
@@ -249,7 +241,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 	if (!oif && ipv6_addr_any(saddr))
 		goto out;
 
-	for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
+	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 		struct net_device *dev = sprt->rt6i_dev;
 
 		if (oif) {
@@ -407,10 +399,10 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 
 	match = NULL;
 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
-	     rt = rt->u.dst.rt6_next)
+	     rt = rt->dst.rt6_next)
 		match = find_match(rt, oif, strict, &mpri, match);
 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
-	     rt = rt->u.dst.rt6_next)
+	     rt = rt->dst.rt6_next)
 		match = find_match(rt, oif, strict, &mpri, match);
 
 	return match;
@@ -432,7 +424,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 
 	if (!match &&
 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
-		struct rt6_info *next = rt0->u.dst.rt6_next;
+		struct rt6_info *next = rt0->dst.rt6_next;
 
 		/* no entries matched; do round-robin */
 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
@@ -446,7 +438,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 		  __func__, match);
 
 	net = dev_net(rt0->rt6i_dev);
-	return (match ? match : net->ipv6.ip6_null_entry);
+	return match ? match : net->ipv6.ip6_null_entry;
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -517,7 +509,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 			rt->rt6i_expires = jiffies + HZ * lifetime;
 			rt->rt6i_flags |= RTF_EXPIRES;
 		}
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 	}
 	return 0;
 }
@@ -555,7 +547,7 @@ restart:
 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
 	BACKTRACK(net, &fl->fl6_src);
 out:
-	dst_use(&rt->u.dst, jiffies);
+	dst_use(&rt->dst, jiffies);
 	read_unlock_bh(&table->tb6_lock);
 	return rt;
 
@@ -643,7 +635,7 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad
 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 		rt->rt6i_dst.plen = 128;
 		rt->rt6i_flags |= RTF_CACHE;
-		rt->u.dst.flags |= DST_HOST;
+		rt->dst.flags |= DST_HOST;
 
 #ifdef CONFIG_IPV6_SUBTREES
 		if (rt->rt6i_src.plen && saddr) {
@@ -676,8 +668,8 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *dad
 
 			if (net_ratelimit())
 				printk(KERN_WARNING
-				       "Neighbour table overflow.\n");
-			dst_free(&rt->u.dst);
+				       "ipv6: Neighbour table overflow.\n");
+			dst_free(&rt->dst);
 			return NULL;
 		}
 		rt->rt6i_nexthop = neigh;
@@ -694,7 +686,7 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *d
 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 		rt->rt6i_dst.plen = 128;
 		rt->rt6i_flags |= RTF_CACHE;
-		rt->u.dst.flags |= DST_HOST;
+		rt->dst.flags |= DST_HOST;
 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
 	}
 	return rt;
@@ -726,7 +718,7 @@ restart:
 	    rt->rt6i_flags & RTF_CACHE)
 		goto out;
 
-	dst_hold(&rt->u.dst);
+	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 
 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
@@ -739,10 +731,10 @@ restart:
 #endif
 	}
 
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 	rt = nrt ? : net->ipv6.ip6_null_entry;
 
-	dst_hold(&rt->u.dst);
+	dst_hold(&rt->dst);
 	if (nrt) {
 		err = ip6_ins_rt(nrt);
 		if (!err)
@@ -756,7 +748,7 @@ restart:
 	 * Race condition! In the gap, when table->tb6_lock was
 	 * released someone could insert this route.  Relookup.
 	 */
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 	goto relookup;
 
 out:
@@ -764,11 +756,11 @@ out:
 		reachable = 0;
 		goto restart_2;
 	}
-	dst_hold(&rt->u.dst);
+	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 out2:
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.__use++;
+	rt->dst.lastuse = jiffies;
+	rt->dst.__use++;
 
 	return rt;
 }
@@ -835,15 +827,15 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
 	struct dst_entry *new = NULL;
 
 	if (rt) {
-		new = &rt->u.dst;
+		new = &rt->dst;
 
 		atomic_set(&new->__refcnt, 1);
 		new->__use = 1;
 		new->input = dst_discard;
 		new->output = dst_discard;
 
-		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
-		new->dev = ort->u.dst.dev;
+		memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
+		new->dev = ort->dst.dev;
 		if (new->dev)
 			dev_hold(new->dev);
 		rt->rt6i_idev = ort->rt6i_idev;
@@ -865,7 +857,7 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
 
 	dst_release(*dstp);
 	*dstp = new;
-	return (new ? 0 : -ENOMEM);
+	return new ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
 
@@ -912,7 +904,7 @@ static void ip6_link_failure(struct sk_buff *skb)
 	rt = (struct rt6_info *) skb_dst(skb);
 	if (rt) {
 		if (rt->rt6i_flags&RTF_CACHE) {
-			dst_set_expires(&rt->u.dst, 0);
+			dst_set_expires(&rt->dst, 0);
 			rt->rt6i_flags |= RTF_EXPIRES;
 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
 			rt->rt6i_node->fn_sernum = -1;
@@ -986,14 +978,14 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 	rt->rt6i_dev	  = dev;
 	rt->rt6i_idev     = idev;
 	rt->rt6i_nexthop  = neigh;
-	atomic_set(&rt->u.dst.__refcnt, 1);
-	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
-	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
-	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
-	rt->u.dst.output  = ip6_output;
+	atomic_set(&rt->dst.__refcnt, 1);
+	rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
+	rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
+	rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
+	rt->dst.output  = ip6_output;
 
 #if 0	/* there's no chance to use these for ndisc */
-	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
+	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
 				? DST_HOST
 				: 0;
 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
@@ -1001,14 +993,14 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 #endif
 
 	spin_lock_bh(&icmp6_dst_lock);
-	rt->u.dst.next = icmp6_dst_gc_list;
-	icmp6_dst_gc_list = &rt->u.dst;
+	rt->dst.next = icmp6_dst_gc_list;
+	icmp6_dst_gc_list = &rt->dst;
 	spin_unlock_bh(&icmp6_dst_lock);
 
 	fib6_force_start_gc(net);
 
 out:
-	return &rt->u.dst;
+	return &rt->dst;
 }
 
 int icmp6_dst_gc(void)
@@ -1064,19 +1056,22 @@ static int ip6_dst_gc(struct dst_ops *ops)
 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+	int entries;
 
+	entries = dst_entries_get_fast(ops);
 	if (time_after(rt_last_gc + rt_min_interval, now) &&
-	    atomic_read(&ops->entries) <= rt_max_size)
+	    entries <= rt_max_size)
 		goto out;
 
 	net->ipv6.ip6_rt_gc_expire++;
 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
 	net->ipv6.ip6_rt_last_gc = now;
-	if (atomic_read(&ops->entries) < ops->gc_thresh)
+	entries = dst_entries_get_slow(ops);
+	if (entries < ops->gc_thresh)
 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
 out:
 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
-	return (atomic_read(&ops->entries) > rt_max_size);
+	return entries > rt_max_size;
 }
 
 /* Clean host part of a prefix. Not necessary in radix tree,
@@ -1090,11 +1085,11 @@ static int ipv6_get_mtu(struct net_device *dev)
 	int mtu = IPV6_MIN_MTU;
 	struct inet6_dev *idev;
 
-	idev = in6_dev_get(dev);
-	if (idev) {
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (idev)
 		mtu = idev->cnf.mtu6;
-		in6_dev_put(idev);
-	}
+	rcu_read_unlock();
 	return mtu;
 }
 
@@ -1103,12 +1098,15 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
 	if (hoplimit < 0) {
 		struct net_device *dev = dst->dev;
-		struct inet6_dev *idev = in6_dev_get(dev);
-		if (idev) {
+		struct inet6_dev *idev;
+
+		rcu_read_lock();
+		idev = __in6_dev_get(dev);
+		if (idev)
 			hoplimit = idev->cnf.hop_limit;
-			in6_dev_put(idev);
-		} else
+		else
 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
+		rcu_read_unlock();
 	}
 	return hoplimit;
 }
@@ -1159,7 +1157,7 @@ int ip6_route_add(struct fib6_config *cfg)
 		goto out;
 	}
 
-	rt->u.dst.obsolete = -1;
+	rt->dst.obsolete = -1;
 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
 				0;
@@ -1171,16 +1169,18 @@ int ip6_route_add(struct fib6_config *cfg)
 	addr_type = ipv6_addr_type(&cfg->fc_dst);
 
 	if (addr_type & IPV6_ADDR_MULTICAST)
-		rt->u.dst.input = ip6_mc_input;
+		rt->dst.input = ip6_mc_input;
+	else if (cfg->fc_flags & RTF_LOCAL)
+		rt->dst.input = ip6_input;
 	else
-		rt->u.dst.input = ip6_forward;
+		rt->dst.input = ip6_forward;
 
-	rt->u.dst.output = ip6_output;
+	rt->dst.output = ip6_output;
 
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
 	rt->rt6i_dst.plen = cfg->fc_dst_len;
 	if (rt->rt6i_dst.plen == 128)
-	       rt->u.dst.flags = DST_HOST;
+	       rt->dst.flags = DST_HOST;
 
 #ifdef CONFIG_IPV6_SUBTREES
 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -1193,7 +1193,8 @@ int ip6_route_add(struct fib6_config *cfg)
 	   they would result in kernel looping; promote them to reject routes
 	 */
 	if ((cfg->fc_flags & RTF_REJECT) ||
-	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
+	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
+					      && !(cfg->fc_flags&RTF_LOCAL))) {
 		/* hold loopback dev/idev if we haven't done so. */
 		if (dev != net->loopback_dev) {
 			if (dev) {
@@ -1208,9 +1209,9 @@ int ip6_route_add(struct fib6_config *cfg)
 				goto out;
 			}
 		}
-		rt->u.dst.output = ip6_pkt_discard_out;
-		rt->u.dst.input = ip6_pkt_discard;
-		rt->u.dst.error = -ENETUNREACH;
+		rt->dst.output = ip6_pkt_discard_out;
+		rt->dst.input = ip6_pkt_discard;
+		rt->dst.error = -ENETUNREACH;
 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
 		goto install_route;
 	}
@@ -1244,7 +1245,7 @@ int ip6_route_add(struct fib6_config *cfg)
 				goto out;
 			if (dev) {
 				if (dev != grt->rt6i_dev) {
-					dst_release(&grt->u.dst);
+					dst_release(&grt->dst);
 					goto out;
 				}
 			} else {
@@ -1255,7 +1256,7 @@ int ip6_route_add(struct fib6_config *cfg)
 			}
 			if (!(grt->rt6i_flags&RTF_GATEWAY))
 				err = 0;
-			dst_release(&grt->u.dst);
+			dst_release(&grt->dst);
 
 			if (err)
 				goto out;
@@ -1294,18 +1295,18 @@ install_route:
 					goto out;
 				}
 
-				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
+				rt->dst.metrics[type - 1] = nla_get_u32(nla);
 			}
 		}
 	}
 
-	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
-		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
-	if (!dst_mtu(&rt->u.dst))
-		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
-	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
-	rt->u.dst.dev = dev;
+	if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
+		rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
+	if (!dst_mtu(&rt->dst))
+		rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
+	if (!dst_metric(&rt->dst, RTAX_ADVMSS))
+		rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
+	rt->dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
 
@@ -1319,7 +1320,7 @@ out:
 	if (idev)
 		in6_dev_put(idev);
 	if (rt)
-		dst_free(&rt->u.dst);
+		dst_free(&rt->dst);
 	return err;
 }
 
@@ -1336,7 +1337,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
 	write_lock_bh(&table->tb6_lock);
 
 	err = fib6_del(rt, info);
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 
 	write_unlock_bh(&table->tb6_lock);
 
@@ -1369,7 +1370,7 @@ static int ip6_route_del(struct fib6_config *cfg)
 			 &cfg->fc_src, cfg->fc_src_len);
 
 	if (fn) {
-		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
+		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
 			if (cfg->fc_ifindex &&
 			    (rt->rt6i_dev == NULL ||
 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
@@ -1379,7 +1380,7 @@ static int ip6_route_del(struct fib6_config *cfg)
 				continue;
 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
 				continue;
-			dst_hold(&rt->u.dst);
+			dst_hold(&rt->dst);
 			read_unlock_bh(&table->tb6_lock);
 
 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
@@ -1421,7 +1422,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	read_lock_bh(&table->tb6_lock);
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 restart:
-	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
+	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
 		/*
 		 * Current route is on-link; redirect is always invalid.
 		 *
@@ -1445,7 +1446,7 @@ restart:
 		rt = net->ipv6.ip6_null_entry;
 	BACKTRACK(net, &fl->fl6_src);
 out:
-	dst_hold(&rt->u.dst);
+	dst_hold(&rt->dst);
 
 	read_unlock_bh(&table->tb6_lock);
 
@@ -1513,10 +1514,10 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
 	 * Look, redirects are sent only in response to data packets,
 	 * so that this nexthop apparently is reachable. --ANK
 	 */
-	dst_confirm(&rt->u.dst);
+	dst_confirm(&rt->dst);
 
 	/* Duplicate redirect: silently ignore. */
-	if (neigh == rt->u.dst.neighbour)
+	if (neigh == rt->dst.neighbour)
 		goto out;
 
 	nrt = ip6_rt_copy(rt);
@@ -1529,20 +1530,20 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
 
 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
 	nrt->rt6i_dst.plen = 128;
-	nrt->u.dst.flags |= DST_HOST;
+	nrt->dst.flags |= DST_HOST;
 
 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
 	nrt->rt6i_nexthop = neigh_clone(neigh);
 	/* Reset pmtu, it may be better */
-	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
-	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
-							dst_mtu(&nrt->u.dst));
+	nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
+	nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
+							dst_mtu(&nrt->dst));
 
 	if (ip6_ins_rt(nrt))
 		goto out;
 
-	netevent.old = &rt->u.dst;
-	netevent.new = &nrt->u.dst;
+	netevent.old = &rt->dst;
+	netevent.new = &nrt->dst;
 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
 	if (rt->rt6i_flags&RTF_CACHE) {
@@ -1551,7 +1552,7 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
 	}
 
 out:
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 }
 
 /*
@@ -1559,18 +1560,17 @@ out:
  *	i.e. Path MTU discovery
  */
 
-void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
-			struct net_device *dev, u32 pmtu)
+static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
+			     struct net *net, u32 pmtu, int ifindex)
 {
 	struct rt6_info *rt, *nrt;
-	struct net *net = dev_net(dev);
 	int allfrag = 0;
 
-	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
+	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
 	if (rt == NULL)
 		return;
 
-	if (pmtu >= dst_mtu(&rt->u.dst))
+	if (pmtu >= dst_mtu(&rt->dst))
 		goto out;
 
 	if (pmtu < IPV6_MIN_MTU) {
@@ -1588,7 +1588,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 	   They are sent only in response to data packets,
 	   so that this nexthop apparently is reachable. --ANK
 	 */
-	dst_confirm(&rt->u.dst);
+	dst_confirm(&rt->dst);
 
 	/* Host route. If it is static, it would be better
 	   not to override it, but add new one, so that
@@ -1596,10 +1596,10 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 	   would return automatically.
 	 */
 	if (rt->rt6i_flags & RTF_CACHE) {
-		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
+		rt->dst.metrics[RTAX_MTU-1] = pmtu;
 		if (allfrag)
-			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
-		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
+			rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
+		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
 		goto out;
 	}
@@ -1615,9 +1615,9 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 		nrt = rt6_alloc_clone(rt, daddr);
 
 	if (nrt) {
-		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
+		nrt->dst.metrics[RTAX_MTU-1] = pmtu;
 		if (allfrag)
-			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
+			nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
 
 		/* According to RFC 1981, detecting PMTU increase shouldn't be
 		 * happened within 5 mins, the recommended timer is 10 mins.
@@ -1625,13 +1625,34 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
 		 * and detecting PMTU increase will be automatically happened.
 		 */
-		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
+		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
 
 		ip6_ins_rt(nrt);
 	}
 out:
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
+}
+
+void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
+			struct net_device *dev, u32 pmtu)
+{
+	struct net *net = dev_net(dev);
+
+	/*
+	 * RFC 1981 states that a node "MUST reduce the size of the packets it
+	 * is sending along the path" that caused the Packet Too Big message.
+	 * Since it's not possible in the general case to determine which
+	 * interface was used to send the original packet, we update the MTU
+	 * on the interface that will be used to send future packets. We also
+	 * update the MTU on the interface that received the Packet Too Big in
+	 * case the original packet was forced out that interface with
+	 * SO_BINDTODEVICE or similar. This is the next best thing to the
+	 * correct behaviour, which would be to update the MTU on all
+	 * interfaces.
+	 */
+	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
+	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
 }
 
 /*
@@ -1644,18 +1665,18 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
 
 	if (rt) {
-		rt->u.dst.input = ort->u.dst.input;
-		rt->u.dst.output = ort->u.dst.output;
-
-		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
-		rt->u.dst.error = ort->u.dst.error;
-		rt->u.dst.dev = ort->u.dst.dev;
-		if (rt->u.dst.dev)
-			dev_hold(rt->u.dst.dev);
+		rt->dst.input = ort->dst.input;
+		rt->dst.output = ort->dst.output;
+
+		memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
+		rt->dst.error = ort->dst.error;
+		rt->dst.dev = ort->dst.dev;
+		if (rt->dst.dev)
+			dev_hold(rt->dst.dev);
 		rt->rt6i_idev = ort->rt6i_idev;
 		if (rt->rt6i_idev)
 			in6_dev_hold(rt->rt6i_idev);
-		rt->u.dst.lastuse = jiffies;
+		rt->dst.lastuse = jiffies;
 		rt->rt6i_expires = 0;
 
 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
@@ -1689,14 +1710,14 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 	if (!fn)
 		goto out;
 
-	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
+	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
 		if (rt->rt6i_dev->ifindex != ifindex)
 			continue;
 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
 			continue;
 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
 			continue;
-		dst_hold(&rt->u.dst);
+		dst_hold(&rt->dst);
 		break;
 	}
 out:
@@ -1744,14 +1765,14 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *d
 		return NULL;
 
 	write_lock_bh(&table->tb6_lock);
-	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
+	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
 		if (dev == rt->rt6i_dev &&
 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
 			break;
 	}
 	if (rt)
-		dst_hold(&rt->u.dst);
+		dst_hold(&rt->dst);
 	write_unlock_bh(&table->tb6_lock);
 	return rt;
 }
@@ -1790,9 +1811,9 @@ void rt6_purge_dflt_routers(struct net *net)
 
 restart:
 	read_lock_bh(&table->tb6_lock);
-	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
+	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
-			dst_hold(&rt->u.dst);
+			dst_hold(&rt->dst);
 			read_unlock_bh(&table->tb6_lock);
 			ip6_del_rt(rt);
 			goto restart;
@@ -1930,15 +1951,15 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	dev_hold(net->loopback_dev);
 	in6_dev_hold(idev);
 
-	rt->u.dst.flags = DST_HOST;
-	rt->u.dst.input = ip6_input;
-	rt->u.dst.output = ip6_output;
+	rt->dst.flags = DST_HOST;
+	rt->dst.input = ip6_input;
+	rt->dst.output = ip6_output;
 	rt->rt6i_dev = net->loopback_dev;
 	rt->rt6i_idev = idev;
-	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
-	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
-	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
-	rt->u.dst.obsolete = -1;
+	rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
+	rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
+	rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
+	rt->dst.obsolete = -1;
 
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast)
@@ -1947,7 +1968,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 		rt->rt6i_flags |= RTF_LOCAL;
 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
 	if (IS_ERR(neigh)) {
-		dst_free(&rt->u.dst);
+		dst_free(&rt->dst);
 
 		/* We are casting this because that is the return
 		 * value type.  But an errno encoded pointer is the
@@ -1962,7 +1983,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 	rt->rt6i_dst.plen = 128;
 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	atomic_set(&rt->dst.__refcnt, 1);
 
 	return rt;
 }
@@ -2033,12 +2054,12 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 	   PMTU discouvery.
 	 */
 	if (rt->rt6i_dev == arg->dev &&
-	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
-	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
-	     (dst_mtu(&rt->u.dst) < arg->mtu &&
-	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
-		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
-		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
+	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
+	    (dst_mtu(&rt->dst) >= arg->mtu ||
+	     (dst_mtu(&rt->dst) < arg->mtu &&
+	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
+		rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
+		rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
 	}
 	return 0;
 }
@@ -2085,6 +2106,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_type == RTN_UNREACHABLE)
 		cfg->fc_flags |= RTF_REJECT;
 
+	if (rtm->rtm_type == RTN_LOCAL)
+		cfg->fc_flags |= RTF_LOCAL;
+
 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
 	cfg->fc_nlinfo.nlh = nlh;
 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
@@ -2205,6 +2229,8 @@ static int rt6_fill_node(struct net *net,
 	NLA_PUT_U32(skb, RTA_TABLE, table);
 	if (rt->rt6i_flags&RTF_REJECT)
 		rtm->rtm_type = RTN_UNREACHABLE;
+	else if (rt->rt6i_flags&RTF_LOCAL)
+		rtm->rtm_type = RTN_LOCAL;
 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
 		rtm->rtm_type = RTN_LOCAL;
 	else
@@ -2252,20 +2278,20 @@ static int rt6_fill_node(struct net *net,
 #endif
 			NLA_PUT_U32(skb, RTA_IIF, iif);
 	} else if (dst) {
-		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
+		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
 		struct in6_addr saddr_buf;
 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
 				       dst, 0, &saddr_buf) == 0)
 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
 	}
 
-	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+	if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
 		goto nla_put_failure;
 
-	if (rt->u.dst.neighbour)
-		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
+	if (rt->dst.neighbour)
+		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
 
-	if (rt->u.dst.dev)
+	if (rt->dst.dev)
 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
 
 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
@@ -2277,8 +2303,8 @@ static int rt6_fill_node(struct net *net,
 	else
 		expires = INT_MAX;
 
-	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
-			       expires, rt->u.dst.error) < 0)
+	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
+			       expires, rt->dst.error) < 0)
 		goto nla_put_failure;
 
 	return nlmsg_end(skb, nlh);
@@ -2364,7 +2390,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
 
 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
@@ -2416,12 +2442,12 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 	struct net *net = dev_net(dev);
 
 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
-		net->ipv6.ip6_null_entry->u.dst.dev = dev;
+		net->ipv6.ip6_null_entry->dst.dev = dev;
 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
+		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
-		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
+		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
 #endif
 	}
@@ -2464,8 +2490,8 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg)
 		seq_puts(m, "00000000000000000000000000000000");
 	}
 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
-		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
-		   rt->u.dst.__use, rt->rt6i_flags,
+		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
+		   rt->dst.__use, rt->rt6i_flags,
 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
 	return 0;
 }
@@ -2499,7 +2525,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
 		   net->ipv6.rt6_stats->fib_rt_alloc,
 		   net->ipv6.rt6_stats->fib_rt_entries,
 		   net->ipv6.rt6_stats->fib_rt_cache,
-		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
+		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
 		   net->ipv6.rt6_stats->fib_discarded_routes);
 
 	return 0;
@@ -2583,7 +2609,7 @@ ctl_table ipv6_route_table_template[] = {
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_jiffies,
+		.proc_handler	=	proc_dointvec,
 	},
 	{
 		.procname	=	"mtu_expires",
@@ -2597,7 +2623,7 @@ ctl_table ipv6_route_table_template[] = {
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_jiffies,
+		.proc_handler	=	proc_dointvec,
 	},
 	{
 		.procname	=	"gc_min_interval_ms",
@@ -2641,14 +2667,17 @@ static int __net_init ip6_route_net_init(struct net *net)
 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
 	       sizeof(net->ipv6.ip6_dst_ops));
 
+	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+		goto out_ip6_dst_ops;
+
 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
 					   sizeof(*net->ipv6.ip6_null_entry),
 					   GFP_KERNEL);
 	if (!net->ipv6.ip6_null_entry)
-		goto out_ip6_dst_ops;
-	net->ipv6.ip6_null_entry->u.dst.path =
+		goto out_ip6_dst_entries;
+	net->ipv6.ip6_null_entry->dst.path =
 		(struct dst_entry *)net->ipv6.ip6_null_entry;
-	net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
+	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
@@ -2656,18 +2685,18 @@ static int __net_init ip6_route_net_init(struct net *net)
 					       GFP_KERNEL);
 	if (!net->ipv6.ip6_prohibit_entry)
 		goto out_ip6_null_entry;
-	net->ipv6.ip6_prohibit_entry->u.dst.path =
+	net->ipv6.ip6_prohibit_entry->dst.path =
 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
-	net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
+	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 
 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
 					       GFP_KERNEL);
 	if (!net->ipv6.ip6_blk_hole_entry)
 		goto out_ip6_prohibit_entry;
-	net->ipv6.ip6_blk_hole_entry->u.dst.path =
+	net->ipv6.ip6_blk_hole_entry->dst.path =
 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
-	net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
+	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 #endif
 
 	net->ipv6.sysctl.flush_delay = 0;
@@ -2695,6 +2724,8 @@ out_ip6_prohibit_entry:
 out_ip6_null_entry:
 	kfree(net->ipv6.ip6_null_entry);
 #endif
+out_ip6_dst_entries:
+	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
 out_ip6_dst_ops:
 	goto out;
 }
@@ -2733,21 +2764,25 @@ int __init ip6_route_init(void)
 	if (!ip6_dst_ops_template.kmem_cachep)
 		goto out;
 
-	ret = register_pernet_subsys(&ip6_route_net_ops);
+	ret = dst_entries_init(&ip6_dst_blackhole_ops);
 	if (ret)
 		goto out_kmem_cache;
 
+	ret = register_pernet_subsys(&ip6_route_net_ops);
+	if (ret)
+		goto out_dst_entries;
+
 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
 
 	/* Registering of the loopback is done before this portion of code,
 	 * the loopback reference in rt6_info will not be taken, do it
 	 * manually for init_net */
-	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
-	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #endif
 	ret = fib6_init();
@@ -2783,6 +2818,8 @@ out_fib6_init:
 	fib6_gc_cleanup();
 out_register_subsys:
 	unregister_pernet_subsys(&ip6_route_net_ops);
+out_dst_entries:
+	dst_entries_destroy(&ip6_dst_blackhole_ops);
 out_kmem_cache:
 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
 	goto out;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e51e650ea80..d6bfaec3bbb 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -63,36 +63,63 @@
 #define HASH_SIZE  16
 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
 
-static void ipip6_tunnel_init(struct net_device *dev);
+static int ipip6_tunnel_init(struct net_device *dev);
 static void ipip6_tunnel_setup(struct net_device *dev);
+static void ipip6_dev_free(struct net_device *dev);
 
 static int sit_net_id __read_mostly;
 struct sit_net {
-	struct ip_tunnel *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel *tunnels_r[HASH_SIZE];
-	struct ip_tunnel *tunnels_l[HASH_SIZE];
-	struct ip_tunnel *tunnels_wc[1];
-	struct ip_tunnel **tunnels[4];
+	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_wc[1];
+	struct ip_tunnel __rcu **tunnels[4];
 
 	struct net_device *fb_tunnel_dev;
 };
 
 /*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
  */
-static DEFINE_SPINLOCK(ipip6_lock);
 
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipip6_get_stats(struct net_device *dev)
+{
+	struct pcpu_tstats sum = { 0 };
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+		sum.rx_packets += tstats->rx_packets;
+		sum.rx_bytes   += tstats->rx_bytes;
+		sum.tx_packets += tstats->tx_packets;
+		sum.tx_bytes   += tstats->tx_bytes;
+	}
+	dev->stats.rx_packets = sum.rx_packets;
+	dev->stats.rx_bytes   = sum.rx_bytes;
+	dev->stats.tx_packets = sum.tx_packets;
+	dev->stats.tx_bytes   = sum.tx_bytes;
+	return &dev->stats;
+}
 /*
  * Must be invoked with rcu_read_lock
  */
 static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
 		struct net_device *dev, __be32 remote, __be32 local)
 {
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(local);
 	struct ip_tunnel *t;
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 
@@ -121,12 +148,12 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
 	return NULL;
 }
 
-static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn,
+static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
 		struct ip_tunnel_parm *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
-	unsigned h = 0;
+	unsigned int h = 0;
 	int prio = 0;
 
 	if (remote) {
@@ -140,7 +167,7 @@ static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn,
 	return &sitn->tunnels[prio][h];
 }
 
-static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn,
+static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
 		struct ip_tunnel *t)
 {
 	return __ipip6_bucket(sitn, &t->parms);
@@ -148,13 +175,14 @@ static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn,
 
 static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp;
-
-	for (tp = ipip6_bucket(sitn, t); *tp; tp = &(*tp)->next) {
-		if (t == *tp) {
-			spin_lock_bh(&ipip6_lock);
-			*tp = t->next;
-			spin_unlock_bh(&ipip6_lock);
+	struct ip_tunnel __rcu **tp;
+	struct ip_tunnel *iter;
+
+	for (tp = ipip6_bucket(sitn, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
 			break;
 		}
 	}
@@ -162,12 +190,10 @@ static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
 
 static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp = ipip6_bucket(sitn, t);
+	struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t);
 
-	spin_lock_bh(&ipip6_lock);
-	t->next = *tp;
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
-	spin_unlock_bh(&ipip6_lock);
 }
 
 static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
@@ -187,17 +213,20 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
 #endif
 }
 
-static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
 		struct ip_tunnel_parm *parms, int create)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
-	struct ip_tunnel *t, **tp, *nt;
+	struct ip_tunnel *t, *nt;
+	struct ip_tunnel __rcu **tp;
 	struct net_device *dev;
 	char name[IFNAMSIZ];
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 
-	for (tp = __ipip6_bucket(sitn, parms); (t = *tp) != NULL; tp = &t->next) {
+	for (tp = __ipip6_bucket(sitn, parms);
+	    (t = rtnl_dereference(*tp)) != NULL;
+	     tp = &t->next) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
 		    parms->link == t->parms.link) {
@@ -213,7 +242,7 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
 	if (parms->name[0])
 		strlcpy(name, parms->name, IFNAMSIZ);
 	else
-		sprintf(name, "sit%%d");
+		strcpy(name, "sit%d");
 
 	dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup);
 	if (dev == NULL)
@@ -229,7 +258,8 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
 	nt = netdev_priv(dev);
 
 	nt->parms = *parms;
-	ipip6_tunnel_init(dev);
+	if (ipip6_tunnel_init(dev) < 0)
+		goto failed_free;
 	ipip6_tunnel_clone_6rd(dev, sitn);
 
 	if (parms->i_flags & SIT_ISATAP)
@@ -244,13 +274,11 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
 	return nt;
 
 failed_free:
-	free_netdev(dev);
+	ipip6_dev_free(dev);
 failed:
 	return NULL;
 }
 
-static DEFINE_SPINLOCK(ipip6_prl_lock);
-
 #define for_each_prl_rcu(start)			\
 	for (prl = rcu_dereference(start);	\
 	     prl;				\
@@ -340,9 +368,9 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
 	if (a->addr == htonl(INADDR_ANY))
 		return -EINVAL;
 
-	spin_lock(&ipip6_prl_lock);
+	ASSERT_RTNL();
 
-	for (p = t->prl; p; p = p->next) {
+	for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
 		if (p->addr == a->addr) {
 			if (chg) {
 				p->flags = a->flags;
@@ -370,7 +398,6 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
 	t->prl_count++;
 	rcu_assign_pointer(t->prl, p);
 out:
-	spin_unlock(&ipip6_prl_lock);
 	return err;
 }
 
@@ -397,7 +424,7 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
 	struct ip_tunnel_prl_entry *x, **p;
 	int err = 0;
 
-	spin_lock(&ipip6_prl_lock);
+	ASSERT_RTNL();
 
 	if (a && a->addr != htonl(INADDR_ANY)) {
 		for (p = &t->prl; *p; p = &(*p)->next) {
@@ -419,7 +446,6 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
 		}
 	}
 out:
-	spin_unlock(&ipip6_prl_lock);
 	return err;
 }
 
@@ -455,15 +481,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 
 	if (dev == sitn->fb_tunnel_dev) {
-		spin_lock_bh(&ipip6_lock);
-		sitn->tunnels_wc[0] = NULL;
-		spin_unlock_bh(&ipip6_lock);
-		dev_put(dev);
+		rcu_assign_pointer(sitn->tunnels_wc[0], NULL);
 	} else {
 		ipip6_tunnel_unlink(sitn, netdev_priv(dev));
 		ipip6_tunnel_del_prl(netdev_priv(dev), NULL);
-		dev_put(dev);
 	}
+	dev_put(dev);
 }
 
 
@@ -552,6 +575,8 @@ static int ipip6_rcv(struct sk_buff *skb)
 	tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
 				     iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
+		struct pcpu_tstats *tstats;
+
 		secpath_reset(skb);
 		skb->mac_header = skb->network_header;
 		skb_reset_network_header(skb);
@@ -567,10 +592,16 @@ static int ipip6_rcv(struct sk_buff *skb)
 			return 0;
 		}
 
-		skb_tunnel_rx(skb, tunnel->dev);
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
 
 		ipip6_ecn_decapsulate(iph, skb);
+
 		netif_rx(skb);
+
 		rcu_read_unlock();
 		return 0;
 	}
@@ -594,7 +625,7 @@ __be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel)
 #ifdef CONFIG_IPV6_SIT_6RD
 	if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
 			      tunnel->ip6rd.prefixlen)) {
-		unsigned pbw0, pbi0;
+		unsigned int pbw0, pbi0;
 		int pbi1;
 		u32 d;
 
@@ -629,14 +660,13 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 				     struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+	struct pcpu_tstats *tstats;
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	struct ipv6hdr *iph6 = ipv6_hdr(skb);
 	u8     tos = tunnel->parms.iph.tos;
 	__be16 df = tiph->frag_off;
 	struct rtable *rt;     			/* Route to the other host */
-	struct net_device *tdev;			/* Device to other host */
+	struct net_device *tdev;		/* Device to other host */
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	__be32 dst = tiph->daddr;
@@ -707,28 +737,28 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 				    .oif = tunnel->parms.link,
 				    .proto = IPPROTO_IPV6 };
 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			stats->tx_carrier_errors++;
+			dev->stats.tx_carrier_errors++;
 			goto tx_error_icmp;
 		}
 	}
 	if (rt->rt_type != RTN_UNICAST) {
 		ip_rt_put(rt);
-		stats->tx_carrier_errors++;
+		dev->stats.tx_carrier_errors++;
 		goto tx_error_icmp;
 	}
-	tdev = rt->u.dst.dev;
+	tdev = rt->dst.dev;
 
 	if (tdev == dev) {
 		ip_rt_put(rt);
-		stats->collisions++;
+		dev->stats.collisions++;
 		goto tx_error;
 	}
 
 	if (df) {
-		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 
 		if (mtu < 68) {
-			stats->collisions++;
+			dev->stats.collisions++;
 			ip_rt_put(rt);
 			goto tx_error;
 		}
@@ -767,7 +797,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
 			ip_rt_put(rt);
-			txq->tx_dropped++;
+			dev->stats.tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -784,7 +814,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags = 0;
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -803,14 +833,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		iph->ttl	=	iph6->hop_limit;
 
 	nf_reset(skb);
-
-	IPTUNNEL_XMIT();
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
 	dst_link_failure(skb);
 tx_error:
-	stats->tx_errors++;
+	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
@@ -833,7 +863,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
 				    .proto = IPPROTO_IPV6 };
 		struct rtable *rt;
 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			tdev = rt->u.dst.dev;
+			tdev = rt->dst.dev;
 			ip_rt_put(rt);
 		}
 		dev->flags |= IFF_POINTOPOINT;
@@ -933,6 +963,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 				}
 				t = netdev_priv(dev);
 				ipip6_tunnel_unlink(sitn, t);
+				synchronize_net();
 				t->parms.iph.saddr = p.iph.saddr;
 				t->parms.iph.daddr = p.iph.daddr;
 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
@@ -1087,12 +1118,19 @@ static const struct net_device_ops ipip6_netdev_ops = {
 	.ndo_start_xmit	= ipip6_tunnel_xmit,
 	.ndo_do_ioctl	= ipip6_tunnel_ioctl,
 	.ndo_change_mtu	= ipip6_tunnel_change_mtu,
+	.ndo_get_stats	= ipip6_get_stats,
 };
 
+static void ipip6_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
 static void ipip6_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipip6_netdev_ops;
-	dev->destructor 	= free_netdev;
+	dev->destructor 	= ipip6_dev_free;
 
 	dev->type		= ARPHRD_SIT;
 	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
@@ -1102,9 +1140,10 @@ static void ipip6_tunnel_setup(struct net_device *dev)
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->features		|= NETIF_F_LLTX;
 }
 
-static void ipip6_tunnel_init(struct net_device *dev)
+static int ipip6_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
@@ -1115,9 +1154,14 @@ static void ipip6_tunnel_init(struct net_device *dev)
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
 	ipip6_tunnel_bind_dev(dev);
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
 }
 
-static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
@@ -1132,11 +1176,15 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 	iph->ttl		= 64;
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
 	dev_hold(dev);
 	sitn->tunnels_wc[0]	= tunnel;
+	return 0;
 }
 
-static struct xfrm_tunnel sit_handler = {
+static struct xfrm_tunnel sit_handler __read_mostly = {
 	.handler	=	ipip6_rcv,
 	.err_handler	=	ipip6_err,
 	.priority	=	1,
@@ -1177,7 +1225,10 @@ static int __net_init sit_init_net(struct net *net)
 	}
 	dev_net_set(sitn->fb_tunnel_dev, net);
 
-	ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+	err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
+	if (err)
+		goto err_dev_free;
+
 	ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
 
 	if ((err = register_netdev(sitn->fb_tunnel_dev)))
@@ -1187,7 +1238,8 @@ static int __net_init sit_init_net(struct net *net)
 
 err_reg_dev:
 	dev_put(sitn->fb_tunnel_dev);
-	free_netdev(sitn->fb_tunnel_dev);
+err_dev_free:
+	ipip6_dev_free(sitn->fb_tunnel_dev);
 err_alloc_dev:
 	return err;
 }
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 34d1f0690d7..09fd34f0dbf 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -27,28 +27,17 @@ extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS];
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
-/*
- * This table has to be sorted and terminated with (__u16)-1.
- * XXX generate a better table.
- * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
- *
- * Taken directly from ipv4 implementation.
- * Should this list be modified for ipv6 use or is it close enough?
- * rfc 2460 8.3 suggests mss values 20 bytes less than ipv4 counterpart
- */
+/* Table must be sorted. */
 static __u16 const msstab[] = {
-	64 - 1,
-	256 - 1,
-	512 - 1,
-	536 - 1,
-	1024 - 1,
-	1440 - 1,
-	1460 - 1,
-	4312 - 1,
-	(__u16)-1
+	64,
+	512,
+	536,
+	1280 - 60,
+	1480 - 60,
+	1500 - 60,
+	4460 - 60,
+	9000 - 60,
 };
-/* The number doesn't include the -1 terminator */
-#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
 
 /*
  * This (misnamed) value is the age of syncookie which is permitted.
@@ -134,9 +123,11 @@ __u32 cookie_v6_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
 
 	tcp_synq_overflow(sk);
 
-	for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
-		;
-	*mssp = msstab[mssind] + 1;
+	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+		if (mss >= msstab[mssind])
+			break;
+
+	*mssp = msstab[mssind];
 
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
 
@@ -154,7 +145,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 					    th->source, th->dest, seq,
 					    jiffies / (HZ * 60), COUNTER_TRIES);
 
-	return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
 }
 
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
@@ -173,8 +164,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	int mss;
 	struct dst_entry *dst;
 	__u8 rcv_wscale;
+	bool ecn_ok;
 
-	if (!sysctl_tcp_syncookies || !th->ack)
+	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
 
 	if (tcp_synq_no_recent_overflow(sk) ||
@@ -189,8 +181,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
 	tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
 
-	if (tcp_opt.saw_tstamp)
-		cookie_check_timestamp(&tcp_opt);
+	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+		goto out;
 
 	ret = NULL;
 	req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
@@ -224,9 +216,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	req->expires = 0UL;
 	req->retrans = 0;
-	ireq->ecn_ok		= 0;
+	ireq->ecn_ok		= ecn_ok;
 	ireq->snd_wscale	= tcp_opt.snd_wscale;
-	ireq->rcv_wscale	= tcp_opt.rcv_wscale;
 	ireq->sack_ok		= tcp_opt.sack_ok;
 	ireq->wscale_ok		= tcp_opt.wscale_ok;
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
@@ -240,17 +231,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	 * me if there is a preferred way.
 	 */
 	{
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_TCP;
 		ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
-		if (np->opt && np->opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, np->opt, &final);
 		ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.mark = sk->sk_mark;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2b7c3a100e2..7e41e2cbb85 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -129,7 +129,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct in6_addr *saddr = NULL, *final_p = NULL, final;
+	struct in6_addr *saddr = NULL, *final_p, final;
 	struct flowi fl;
 	struct dst_entry *dst;
 	int addr_type;
@@ -139,7 +139,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		return -EINVAL;
 
 	if (usin->sin6_family != AF_INET6)
-		return(-EAFNOSUPPORT);
+		return -EAFNOSUPPORT;
 
 	memset(&fl, 0, sizeof(fl));
 
@@ -250,12 +250,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl.fl_ip_dport = usin->sin6_port;
 	fl.fl_ip_sport = inet->inet_sport;
 
-	if (np->opt && np->opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, np->opt, &final);
 
 	security_sk_classify_flow(sk, &fl);
 
@@ -477,7 +472,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	struct ipv6_txoptions *opt = NULL;
-	struct in6_addr * final_p = NULL, final;
+	struct in6_addr * final_p, final;
 	struct flowi fl;
 	struct dst_entry *dst;
 	int err = -1;
@@ -494,12 +489,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	security_req_classify_flow(req, &fl);
 
 	opt = np->opt;
-	if (opt && opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
-	}
+	final_p = fl6_update_dst(&fl, opt, &final);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
@@ -1167,7 +1157,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 	}
 
 #ifdef CONFIG_SYN_COOKIES
-	if (!th->rst && !th->syn && th->ack)
+	if (!th->syn)
 		sk = cookie_v6_check(sk, skb);
 #endif
 	return sk;
@@ -1279,13 +1269,10 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	treq = inet6_rsk(req);
 	ipv6_addr_copy(&treq->rmt_addr, &ipv6_hdr(skb)->saddr);
 	ipv6_addr_copy(&treq->loc_addr, &ipv6_hdr(skb)->daddr);
-	if (!want_cookie)
+	if (!want_cookie || tmp_opt.tstamp_ok)
 		TCP_ECN_create_request(req, tcp_hdr(skb));
 
-	if (want_cookie) {
-		isn = cookie_v6_init_sequence(sk, skb, &req->mss);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-	} else if (!isn) {
+	if (!isn) {
 		if (ipv6_opt_accepted(sk, skb) ||
 		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
 		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -1298,8 +1285,12 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		if (!sk->sk_bound_dev_if &&
 		    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
 			treq->iif = inet6_iif(skb);
-
-		isn = tcp_v6_init_sequence(skb);
+		if (!want_cookie) {
+			isn = tcp_v6_init_sequence(skb);
+		} else {
+			isn = cookie_v6_init_sequence(sk, skb, &req->mss);
+			req->cookie_ts = tmp_opt.tstamp_ok;
+		}
 	}
 	tcp_rsk(req)->snt_isn = isn;
 
@@ -1392,18 +1383,13 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		goto out_overflow;
 
 	if (dst == NULL) {
-		struct in6_addr *final_p = NULL, final;
+		struct in6_addr *final_p, final;
 		struct flowi fl;
 
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_TCP;
 		ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
-		if (opt && opt->srcrt) {
-			struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-			ipv6_addr_copy(&final, &fl.fl6_dst);
-			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-			final_p = &final;
-		}
+		final_p = fl6_update_dst(&fl, opt, &final);
 		ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.mark = sk->sk_mark;
@@ -1423,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (newsk == NULL)
-		goto out;
+		goto out_nonewsk;
 
 	/*
 	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1511,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto out;
+	}
 	__inet6_hash(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 out_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+out_nonewsk:
 	if (opt && opt != np->opt)
 		sock_kfree_s(sk, opt, opt->tot_len);
 	dst_release(dst);
+out:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	return NULL;
 }
 
@@ -2156,6 +2146,8 @@ struct proto tcpv6_prot = {
 	.setsockopt		= tcp_setsockopt,
 	.getsockopt		= tcp_getsockopt,
 	.recvmsg		= tcp_recvmsg,
+	.sendmsg		= tcp_sendmsg,
+	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
@@ -2174,6 +2166,7 @@ struct proto tcpv6_prot = {
 	.twsk_prot		= &tcp6_timewait_sock_ops,
 	.rsk_prot		= &tcp6_request_sock_ops,
 	.h.hashinfo		= &tcp_hashinfo,
+	.no_autobind		= true,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index fc3c86a4745..4f3cec12aa8 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -30,28 +30,31 @@
 #include <net/protocol.h>
 #include <net/xfrm.h>
 
-static struct xfrm6_tunnel *tunnel6_handlers;
-static struct xfrm6_tunnel *tunnel46_handlers;
+static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
 static DEFINE_MUTEX(tunnel6_mutex);
 
 int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
 {
-	struct xfrm6_tunnel **pprev;
+	struct xfrm6_tunnel __rcu **pprev;
+	struct xfrm6_tunnel *t;
 	int ret = -EEXIST;
 	int priority = handler->priority;
 
 	mutex_lock(&tunnel6_mutex);
 
 	for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
-	     *pprev; pprev = &(*pprev)->next) {
-		if ((*pprev)->priority > priority)
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&tunnel6_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t->priority > priority)
 			break;
-		if ((*pprev)->priority == priority)
+		if (t->priority == priority)
 			goto err;
 	}
 
 	handler->next = *pprev;
-	*pprev = handler;
+	rcu_assign_pointer(*pprev, handler);
 
 	ret = 0;
 
@@ -65,14 +68,17 @@ EXPORT_SYMBOL(xfrm6_tunnel_register);
 
 int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
 {
-	struct xfrm6_tunnel **pprev;
+	struct xfrm6_tunnel __rcu **pprev;
+	struct xfrm6_tunnel *t;
 	int ret = -ENOENT;
 
 	mutex_lock(&tunnel6_mutex);
 
 	for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
-	     *pprev; pprev = &(*pprev)->next) {
-		if (*pprev == handler) {
+	     (t = rcu_dereference_protected(*pprev,
+			lockdep_is_held(&tunnel6_mutex))) != NULL;
+	     pprev = &t->next) {
+		if (t == handler) {
 			*pprev = handler->next;
 			ret = 0;
 			break;
@@ -88,6 +94,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
 
 EXPORT_SYMBOL(xfrm6_tunnel_deregister);
 
+#define for_each_tunnel_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))	\
+
 static int tunnel6_rcv(struct sk_buff *skb)
 {
 	struct xfrm6_tunnel *handler;
@@ -95,7 +106,7 @@ static int tunnel6_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	for (handler = tunnel6_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel6_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -113,7 +124,7 @@ static int tunnel46_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto drop;
 
-	for (handler = tunnel46_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel46_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -129,7 +140,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 {
 	struct xfrm6_tunnel *handler;
 
-	for (handler = tunnel6_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
 			break;
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 87be58673b5..91def93bec8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,10 +111,19 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
 	return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr);
 }
 
+static void udp_v6_rehash(struct sock *sk)
+{
+	u16 new_hash = udp6_portaddr_hash(sock_net(sk),
+					  &inet6_sk(sk)->rcv_saddr,
+					  inet_sk(sk)->inet_num);
+
+	udp_lib_rehash(sk, new_hash);
+}
+
 static inline int compute_score(struct sock *sk, struct net *net,
 				unsigned short hnum,
-				struct in6_addr *saddr, __be16 sport,
-				struct in6_addr *daddr, __be16 dport,
+				const struct in6_addr *saddr, __be16 sport,
+				const struct in6_addr *daddr, __be16 dport,
 				int dif)
 {
 	int score = -1;
@@ -230,8 +239,8 @@ exact_match:
 }
 
 static struct sock *__udp6_lib_lookup(struct net *net,
-				      struct in6_addr *saddr, __be16 sport,
-				      struct in6_addr *daddr, __be16 dport,
+				      const struct in6_addr *saddr, __be16 sport,
+				      const struct in6_addr *daddr, __be16 dport,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
@@ -311,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 				 udptable);
 }
 
+struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+			     const struct in6_addr *daddr, __be16 dport, int dif)
+{
+	return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+
+
 /*
  * 	This should be easy, if there is something there we
  * 	return it, otherwise we block.
@@ -510,7 +527,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 		}
 	}
 
-	if (sk->sk_filter) {
+	if (rcu_dereference_raw(sk->sk_filter)) {
 		if (udp_lib_checksum_complete(skb))
 			goto drop;
 	}
@@ -927,7 +944,7 @@ int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
-	struct in6_addr *daddr, *final_p = NULL, final;
+	struct in6_addr *daddr, *final_p, final;
 	struct ipv6_txoptions *opt = NULL;
 	struct ip6_flowlabel *flowlabel = NULL;
 	struct flowi fl;
@@ -1097,14 +1114,9 @@ do_udp_sendmsg:
 		ipv6_addr_copy(&fl.fl6_src, &np->saddr);
 	fl.fl_ip_sport = inet->inet_sport;
 
-	/* merge ip6_build_xmit from ip6_output */
-	if (opt && opt->srcrt) {
-		struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
-		ipv6_addr_copy(&final, &fl.fl6_dst);
-		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
-		final_p = &final;
+	final_p = fl6_update_dst(&fl, opt, &final);
+	if (final_p)
 		connected = 0;
-	}
 
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) {
 		fl.oif = np->mcast_oif;
@@ -1452,6 +1464,7 @@ struct proto udpv6_prot = {
 	.backlog_rcv	   = udpv6_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
+	.rehash		   = udp_v6_rehash,
 	.get_port	   = udp_v6_get_port,
 	.memory_allocated  = &udp_memory_allocated,
 	.sysctl_mem	   = sysctl_udp_mem,
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6baeabbbca8..7e74023ea6e 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
 
 	xfrm6_policy_afinfo.garbage_collect(net);
-	return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
 }
 
 static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -255,7 +255,6 @@ static struct dst_ops xfrm6_dst_ops = {
 	.ifdown =		xfrm6_dst_ifdown,
 	.local_out =		__ip6_local_out,
 	.gc_thresh =		1024,
-	.entries =		ATOMIC_INIT(0),
 };
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
@@ -312,11 +311,13 @@ int __init xfrm6_init(void)
 	 */
 	gc_thresh = FIB6_TABLE_HASHSZ * 8;
 	xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh;
+	dst_entries_init(&xfrm6_dst_ops);
 
 	ret = xfrm6_policy_init();
-	if (ret)
+	if (ret) {
+		dst_entries_destroy(&xfrm6_dst_ops);
 		goto out;
-
+	}
 	ret = xfrm6_state_init();
 	if (ret)
 		goto out_policy;
@@ -341,4 +342,5 @@ void xfrm6_fini(void)
 	//xfrm6_input_fini();
 	xfrm6_policy_fini();
 	xfrm6_state_fini();
+	dst_entries_destroy(&xfrm6_dst_ops);
 }
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index f417b77fa0e..a67575d472a 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -20,23 +20,27 @@
 #include <net/addrconf.h>
 
 static void
-__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		     struct xfrm_tmpl *tmpl,
-		     xfrm_address_t *daddr, xfrm_address_t *saddr)
+__xfrm6_init_tempsel(struct xfrm_selector *sel, struct flowi *fl)
 {
 	/* Initialize temporary selector matching only
 	 * to current session. */
-	ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst);
-	ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src);
-	x->sel.dport = xfrm_flowi_dport(fl);
-	x->sel.dport_mask = htons(0xffff);
-	x->sel.sport = xfrm_flowi_sport(fl);
-	x->sel.sport_mask = htons(0xffff);
-	x->sel.family = AF_INET6;
-	x->sel.prefixlen_d = 128;
-	x->sel.prefixlen_s = 128;
-	x->sel.proto = fl->proto;
-	x->sel.ifindex = fl->oif;
+	ipv6_addr_copy((struct in6_addr *)&sel->daddr, &fl->fl6_dst);
+	ipv6_addr_copy((struct in6_addr *)&sel->saddr, &fl->fl6_src);
+	sel->dport = xfrm_flowi_dport(fl);
+	sel->dport_mask = htons(0xffff);
+	sel->sport = xfrm_flowi_sport(fl);
+	sel->sport_mask = htons(0xffff);
+	sel->family = AF_INET6;
+	sel->prefixlen_d = 128;
+	sel->prefixlen_s = 128;
+	sel->proto = fl->proto;
+	sel->ifindex = fl->oif;
+}
+
+static void
+xfrm6_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl,
+		   xfrm_address_t *daddr, xfrm_address_t *saddr)
+{
 	x->id = tmpl->id;
 	if (ipv6_addr_any((struct in6_addr*)&x->id.daddr))
 		memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
@@ -168,6 +172,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.eth_proto		= htons(ETH_P_IPV6),
 	.owner			= THIS_MODULE,
 	.init_tempsel		= __xfrm6_init_tempsel,
+	.init_temprop		= xfrm6_init_temprop,
 	.tmpl_sort		= __xfrm6_tmpl_sort,
 	.state_sort		= __xfrm6_state_sort,
 	.output			= xfrm6_output,
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 2ce3a8278f2..2969cad408d 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -199,7 +199,7 @@ static void x6spi_destroy_rcu(struct rcu_head *head)
 			container_of(head, struct xfrm6_tunnel_spi, rcu_head));
 }
 
-void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
+static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
 {
 	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
 	struct xfrm6_tunnel_spi *x6spi;
@@ -223,8 +223,6 @@ void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr)
 	spin_unlock_bh(&xfrm6_tunnel_spi_lock);
 }
 
-EXPORT_SYMBOL(xfrm6_tunnel_free_spi);
-
 static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	skb_push(skb, -skb_network_offset(skb));
@@ -317,13 +315,13 @@ static const struct xfrm_type xfrm6_tunnel_type = {
 	.output		= xfrm6_tunnel_output,
 };
 
-static struct xfrm6_tunnel xfrm6_tunnel_handler = {
+static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
 	.handler	= xfrm6_tunnel_rcv,
 	.err_handler	= xfrm6_tunnel_err,
 	.priority	= 2,
 };
 
-static struct xfrm6_tunnel xfrm46_tunnel_handler = {
+static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
 	.handler	= xfrm6_tunnel_rcv,
 	.err_handler	= xfrm6_tunnel_err,
 	.priority	= 2,
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index e9ad0062fbb..02549cb2c32 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -3,6 +3,7 @@
 #
 config IPX
 	tristate "The IPX protocol"
+	depends on BKL # should be fixable
 	select LLC
 	---help---
 	  This is support for the Novell networking protocol, IPX, commonly
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 79986a674f6..7f097989cde 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -573,9 +573,9 @@ static int irda_find_lsap_sel(struct irda_sock *self, char *name)
 		/* Requested object/attribute doesn't exist */
 		if((self->errno == IAS_CLASS_UNKNOWN) ||
 		   (self->errno == IAS_ATTRIB_UNKNOWN))
-			return (-EADDRNOTAVAIL);
+			return -EADDRNOTAVAIL;
 		else
-			return (-EHOSTUNREACH);
+			return -EHOSTUNREACH;
 	}
 
 	/* Get the remote TSAP selector */
@@ -663,7 +663,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
 					   __func__, name);
 				self->daddr = DEV_ADDR_ANY;
 				kfree(discoveries);
-				return(-ENOTUNIQ);
+				return -ENOTUNIQ;
 			}
 			/* First time we found that one, save it ! */
 			daddr = self->daddr;
@@ -677,7 +677,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
 			IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__);
 			self->daddr = DEV_ADDR_ANY;
 			kfree(discoveries);
-			return(-EHOSTUNREACH);
+			return -EHOSTUNREACH;
 			break;
 		}
 	}
@@ -689,7 +689,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
 		IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n",
 			   __func__, name);
 		self->daddr = DEV_ADDR_ANY;
-		return(-EADDRNOTAVAIL);
+		return -EADDRNOTAVAIL;
 	}
 
 	/* Revert back to discovered device & service */
@@ -715,14 +715,11 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
 	struct sockaddr_irda saddr;
 	struct sock *sk = sock->sk;
 	struct irda_sock *self = irda_sk(sk);
-	int err;
 
-	lock_kernel();
 	memset(&saddr, 0, sizeof(saddr));
 	if (peer) {
-		err  = -ENOTCONN;
 		if (sk->sk_state != TCP_ESTABLISHED)
-			goto out;
+			return -ENOTCONN;
 
 		saddr.sir_family = AF_IRDA;
 		saddr.sir_lsap_sel = self->dtsap_sel;
@@ -739,10 +736,8 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr,
 	/* uaddr_len come to us uninitialised */
 	*uaddr_len = sizeof (struct sockaddr_irda);
 	memcpy(uaddr, &saddr, *uaddr_len);
-	err = 0;
-out:
-	unlock_kernel();
-	return err;
+
+	return 0;
 }
 
 /*
@@ -758,7 +753,8 @@ static int irda_listen(struct socket *sock, int backlog)
 
 	IRDA_DEBUG(2, "%s()\n", __func__);
 
-	lock_kernel();
+	lock_sock(sk);
+
 	if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) &&
 	    (sk->sk_type != SOCK_DGRAM))
 		goto out;
@@ -770,7 +766,7 @@ static int irda_listen(struct socket *sock, int backlog)
 		err = 0;
 	}
 out:
-	unlock_kernel();
+	release_sock(sk);
 
 	return err;
 }
@@ -793,7 +789,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len != sizeof(struct sockaddr_irda))
 		return -EINVAL;
 
-	lock_kernel();
+	lock_sock(sk);
 #ifdef CONFIG_IRDA_ULTRA
 	/* Special care for Ultra sockets */
 	if ((sk->sk_type == SOCK_DGRAM) &&
@@ -824,8 +820,8 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name);
 	if (err < 0) {
-		kfree(self->ias_obj->name);
-		kfree(self->ias_obj);
+		irias_delete_object(self->ias_obj);
+		self->ias_obj = NULL;
 		goto out;
 	}
 
@@ -836,7 +832,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	err = 0;
 out:
-	unlock_kernel();
+	release_sock(sk);
 	return err;
 }
 
@@ -856,12 +852,13 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	IRDA_DEBUG(2, "%s()\n", __func__);
 
-	lock_kernel();
 	err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0);
 	if (err)
-		goto out;
+		return err;
 
 	err = -EINVAL;
+
+	lock_sock(sk);
 	if (sock->state != SS_UNCONNECTED)
 		goto out;
 
@@ -947,7 +944,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags)
 	irda_connect_response(new);
 	err = 0;
 out:
-	unlock_kernel();
+	release_sock(sk);
 	return err;
 }
 
@@ -981,7 +978,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
 
 	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
 
-	lock_kernel();
+	lock_sock(sk);
 	/* Don't allow connect for Ultra sockets */
 	err = -ESOCKTNOSUPPORT;
 	if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA))
@@ -1072,6 +1069,8 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
 
 	if (sk->sk_state != TCP_ESTABLISHED) {
 		sock->state = SS_UNCONNECTED;
+		if (sk->sk_prot->disconnect(sk, flags))
+			sock->state = SS_DISCONNECTING;
 		err = sock_error(sk);
 		if (!err)
 			err = -ECONNRESET;
@@ -1084,7 +1083,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr,
 	self->saddr = irttp_get_saddr(self->tsap);
 	err = 0;
 out:
-	unlock_kernel();
+	release_sock(sk);
 	return err;
 }
 
@@ -1231,7 +1230,6 @@ static int irda_release(struct socket *sock)
 	if (sk == NULL)
 		return 0;
 
-	lock_kernel();
 	lock_sock(sk);
 	sk->sk_state       = TCP_CLOSE;
 	sk->sk_shutdown   |= SEND_SHUTDOWN;
@@ -1250,7 +1248,6 @@ static int irda_release(struct socket *sock)
 	/* Destroy networking socket if we are the last reference on it,
 	 * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */
 	sock_put(sk);
-	unlock_kernel();
 
 	/* Notes on socket locking and deallocation... - Jean II
 	 * In theory we should put pairs of sock_hold() / sock_put() to
@@ -1298,7 +1295,6 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
 
-	lock_kernel();
 	/* Note : socket.c set MSG_EOR on SEQPACKET sockets */
 	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT |
 			       MSG_NOSIGNAL)) {
@@ -1306,6 +1302,8 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 		goto out;
 	}
 
+	lock_sock(sk);
+
 	if (sk->sk_shutdown & SEND_SHUTDOWN)
 		goto out_err;
 
@@ -1361,14 +1359,14 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 		goto out_err;
 	}
 
-	unlock_kernel();
+	release_sock(sk);
 	/* Tell client how much data we actually sent */
 	return len;
 
 out_err:
 	err = sk_stream_error(sk, msg->msg_flags, err);
 out:
-	unlock_kernel();
+	release_sock(sk);
 	return err;
 
 }
@@ -1390,14 +1388,10 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
 
 	IRDA_DEBUG(4, "%s()\n", __func__);
 
-	lock_kernel();
-	if ((err = sock_error(sk)) < 0)
-		goto out;
-
 	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
 				flags & MSG_DONTWAIT, &err);
 	if (!skb)
-		goto out;
+		return err;
 
 	skb_reset_transport_header(skb);
 	copied = skb->len;
@@ -1425,12 +1419,8 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
 			irttp_flow_request(self->tsap, FLOW_START);
 		}
 	}
-	unlock_kernel();
-	return copied;
 
-out:
-	unlock_kernel();
-	return err;
+	return copied;
 }
 
 /*
@@ -1448,17 +1438,15 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 
 	IRDA_DEBUG(3, "%s()\n", __func__);
 
-	lock_kernel();
 	if ((err = sock_error(sk)) < 0)
-		goto out;
+		return err;
 
-	err = -EINVAL;
 	if (sock->flags & __SO_ACCEPTCON)
-		goto out;
+		return -EINVAL;
 
 	err =-EOPNOTSUPP;
 	if (flags & MSG_OOB)
-		goto out;
+		return -EOPNOTSUPP;
 
 	err = 0;
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
@@ -1500,7 +1488,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 			finish_wait(sk_sleep(sk), &wait);
 
 			if (err)
-				goto out;
+				return err;
 			if (sk->sk_shutdown & RCV_SHUTDOWN)
 				break;
 
@@ -1553,9 +1541,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock,
 		}
 	}
 
-out:
-	unlock_kernel();
-	return err ? : copied;
+	return copied;
 }
 
 /*
@@ -1573,13 +1559,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
 	struct sk_buff *skb;
 	int err;
 
-	lock_kernel();
-
 	IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
 
-	err = -EINVAL;
 	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
-		goto out;
+		return -EINVAL;
+
+	lock_sock(sk);
 
 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
 		send_sig(SIGPIPE, current, 0);
@@ -1630,10 +1615,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
 		IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
 		goto out;
 	}
-	unlock_kernel();
+
+	release_sock(sk);
 	return len;
+
 out:
-	unlock_kernel();
+	release_sock(sk);
 	return err;
 }
 
@@ -1656,10 +1643,11 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
 
 	IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len);
 
-	lock_kernel();
 	err = -EINVAL;
 	if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
-		goto out;
+		return -EINVAL;
+
+	lock_sock(sk);
 
 	err = -EPIPE;
 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
@@ -1732,7 +1720,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
 	if (err)
 		IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err);
 out:
-	unlock_kernel();
+	release_sock(sk);
 	return err ? : len;
 }
 #endif /* CONFIG_IRDA_ULTRA */
@@ -1747,7 +1735,7 @@ static int irda_shutdown(struct socket *sock, int how)
 
 	IRDA_DEBUG(1, "%s(%p)\n", __func__, self);
 
-	lock_kernel();
+	lock_sock(sk);
 
 	sk->sk_state       = TCP_CLOSE;
 	sk->sk_shutdown   |= SEND_SHUTDOWN;
@@ -1769,7 +1757,7 @@ static int irda_shutdown(struct socket *sock, int how)
 	self->daddr = DEV_ADDR_ANY;	/* Until we get re-connected */
 	self->saddr = 0x0;		/* so IrLMP assign us any link */
 
-	unlock_kernel();
+	release_sock(sk);
 
 	return 0;
 }
@@ -1786,7 +1774,6 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
 
 	IRDA_DEBUG(4, "%s()\n", __func__);
 
-	lock_kernel();
 	poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
@@ -1834,20 +1821,8 @@ static unsigned int irda_poll(struct file * file, struct socket *sock,
 	default:
 		break;
 	}
-	unlock_kernel();
-	return mask;
-}
-
-static unsigned int irda_datagram_poll(struct file *file, struct socket *sock,
-			   poll_table *wait)
-{
-	int err;
-
-	lock_kernel();
-	err = datagram_poll(file, sock, wait);
-	unlock_kernel();
 
-	return err;
+	return mask;
 }
 
 /*
@@ -1860,7 +1835,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 
 	IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd);
 
-	lock_kernel();
 	err = -EINVAL;
 	switch (cmd) {
 	case TIOCOUTQ: {
@@ -1903,7 +1877,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__);
 		err = -ENOIOCTLCMD;
 	}
-	unlock_kernel();
 
 	return err;
 }
@@ -1927,7 +1900,7 @@ static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
  *    Set some options for the socket
  *
  */
-static int __irda_setsockopt(struct socket *sock, int level, int optname,
+static int irda_setsockopt(struct socket *sock, int level, int optname,
 			   char __user *optval, unsigned int optlen)
 {
 	struct sock *sk = sock->sk;
@@ -1935,13 +1908,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 	struct irda_ias_set    *ias_opt;
 	struct ias_object      *ias_obj;
 	struct ias_attrib *	ias_attr;	/* Attribute in IAS object */
-	int opt, free_ias = 0;
+	int opt, free_ias = 0, err = 0;
 
 	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
 
 	if (level != SOL_IRLMP)
 		return -ENOPROTOOPT;
 
+	lock_sock(sk);
+
 	switch (optname) {
 	case IRLMP_IAS_SET:
 		/* The user want to add an attribute to an existing IAS object
@@ -1951,17 +1926,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		 * create the right attribute...
 		 */
 
-		if (optlen != sizeof(struct irda_ias_set))
-			return -EINVAL;
+		if (optlen != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
 
 		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
-		if (ias_opt == NULL)
-			return -ENOMEM;
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		/* Copy query to the driver. */
 		if (copy_from_user(ias_opt, optval, optlen)) {
 			kfree(ias_opt);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 
 		/* Find the object we target.
@@ -1971,7 +1951,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		if(ias_opt->irda_class_name[0] == '\0') {
 			if(self->ias_obj == NULL) {
 				kfree(ias_opt);
-				return -EINVAL;
+				err = -EINVAL;
+				goto out;
 			}
 			ias_obj = self->ias_obj;
 		} else
@@ -1983,7 +1964,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		if((!capable(CAP_NET_ADMIN)) &&
 		   ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
 			kfree(ias_opt);
-			return -EPERM;
+			err = -EPERM;
+			goto out;
 		}
 
 		/* If the object doesn't exist, create it */
@@ -1993,7 +1975,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 						   jiffies);
 			if (ias_obj == NULL) {
 				kfree(ias_opt);
-				return -ENOMEM;
+				err = -ENOMEM;
+				goto out;
 			}
 			free_ias = 1;
 		}
@@ -2005,7 +1988,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 				kfree(ias_obj->name);
 				kfree(ias_obj);
 			}
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 
 		/* Look at the type */
@@ -2028,7 +2012,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 					kfree(ias_obj);
 				}
 
-				return -EINVAL;
+				err = -EINVAL;
+				goto out;
 			}
 			/* Add an octet sequence attribute */
 			irias_add_octseq_attrib(
@@ -2060,7 +2045,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 				kfree(ias_obj->name);
 				kfree(ias_obj);
 			}
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 		irias_insert_object(ias_obj);
 		kfree(ias_opt);
@@ -2071,17 +2057,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		 * object is not owned by the kernel and delete it.
 		 */
 
-		if (optlen != sizeof(struct irda_ias_set))
-			return -EINVAL;
+		if (optlen != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
 
 		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
-		if (ias_opt == NULL)
-			return -ENOMEM;
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		/* Copy query to the driver. */
 		if (copy_from_user(ias_opt, optval, optlen)) {
 			kfree(ias_opt);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 
 		/* Find the object we target.
@@ -2094,7 +2085,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 			ias_obj = irias_find_object(ias_opt->irda_class_name);
 		if(ias_obj == (struct ias_object *) NULL) {
 			kfree(ias_opt);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 
 		/* Only ROOT can mess with the global IAS database.
@@ -2103,7 +2095,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		if((!capable(CAP_NET_ADMIN)) &&
 		   ((ias_obj == NULL) || (ias_obj != self->ias_obj))) {
 			kfree(ias_opt);
-			return -EPERM;
+			err = -EPERM;
+			goto out;
 		}
 
 		/* Find the attribute (in the object) we target */
@@ -2111,14 +2104,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 					     ias_opt->irda_attrib_name);
 		if(ias_attr == (struct ias_attrib *) NULL) {
 			kfree(ias_opt);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 
 		/* Check is the user space own the object */
 		if(ias_attr->value->owner != IAS_USER_ATTR) {
 			IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__);
 			kfree(ias_opt);
-			return -EPERM;
+			err = -EPERM;
+			goto out;
 		}
 
 		/* Remove the attribute (and maybe the object) */
@@ -2126,11 +2121,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		kfree(ias_opt);
 		break;
 	case IRLMP_MAX_SDU_SIZE:
-		if (optlen < sizeof(int))
-			return -EINVAL;
+		if (optlen < sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
 
-		if (get_user(opt, (int __user *)optval))
-			return -EFAULT;
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
 
 		/* Only possible for a seqpacket service (TTP with SAR) */
 		if (sk->sk_type != SOCK_SEQPACKET) {
@@ -2140,16 +2139,21 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		} else {
 			IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n",
 				     __func__);
-			return -ENOPROTOOPT;
+			err = -ENOPROTOOPT;
+			goto out;
 		}
 		break;
 	case IRLMP_HINTS_SET:
-		if (optlen < sizeof(int))
-			return -EINVAL;
+		if (optlen < sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
 
 		/* The input is really a (__u8 hints[2]), easier as an int */
-		if (get_user(opt, (int __user *)optval))
-			return -EFAULT;
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
 
 		/* Unregister any old registration */
 		if (self->skey)
@@ -2163,12 +2167,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 		 * making a discovery (nodes which don't match any hint
 		 * bit in the mask are not reported).
 		 */
-		if (optlen < sizeof(int))
-			return -EINVAL;
+		if (optlen < sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
 
 		/* The input is really a (__u8 hints[2]), easier as an int */
-		if (get_user(opt, (int __user *)optval))
-			return -EFAULT;
+		if (get_user(opt, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
 
 		/* Set the new hint mask */
 		self->mask.word = (__u16) opt;
@@ -2180,19 +2188,12 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname,
 
 		break;
 	default:
-		return -ENOPROTOOPT;
+		err = -ENOPROTOOPT;
+		break;
 	}
-	return 0;
-}
-
-static int irda_setsockopt(struct socket *sock, int level, int optname,
-			   char __user *optval, unsigned int optlen)
-{
-	int err;
 
-	lock_kernel();
-	err = __irda_setsockopt(sock, level, optname, optval, optlen);
-	unlock_kernel();
+out:
+	release_sock(sk);
 
 	return err;
 }
@@ -2249,7 +2250,7 @@ static int irda_extract_ias_value(struct irda_ias_set *ias_opt,
 /*
  * Function irda_getsockopt (sock, level, optname, optval, optlen)
  */
-static int __irda_getsockopt(struct socket *sock, int level, int optname,
+static int irda_getsockopt(struct socket *sock, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
 	struct sock *sk = sock->sk;
@@ -2262,7 +2263,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
 	int daddr = DEV_ADDR_ANY;	/* Dest address for IAS queries */
 	int val = 0;
 	int len = 0;
-	int err;
+	int err = 0;
 	int offset, total;
 
 	IRDA_DEBUG(2, "%s(%p)\n", __func__, self);
@@ -2276,15 +2277,18 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
 	if(len < 0)
 		return -EINVAL;
 
+	lock_sock(sk);
+
 	switch (optname) {
 	case IRLMP_ENUMDEVICES:
 		/* Ask lmp for the current discovery log */
 		discoveries = irlmp_get_discoveries(&list.len, self->mask.word,
 						    self->nslots);
 		/* Check if the we got some results */
-		if (discoveries == NULL)
-			return -EAGAIN;		/* Didn't find any devices */
-		err = 0;
+		if (discoveries == NULL) {
+			err = -EAGAIN;
+			goto out;		/* Didn't find any devices */
+		}
 
 		/* Write total list length back to client */
 		if (copy_to_user(optval, &list,
@@ -2297,8 +2301,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
 			sizeof(struct irda_device_info);
 
 		/* Copy the list itself - watch for overflow */
-		if(list.len > 2048)
-		{
+		if (list.len > 2048) {
 			err = -EINVAL;
 			goto bed;
 		}
@@ -2314,17 +2317,20 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname,
 bed:
 		/* Free up our buffer */
 		kfree(discoveries);
-		if (err)
-			return err;
 		break;
 	case IRLMP_MAX_SDU_SIZE:
 		val = self->max_data_size;
 		len = sizeof(int);
-		if (put_user(len, optlen))
-			return -EFAULT;
+		if (put_user(len, optlen)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		if (copy_to_user(optval, &val, len)) {
+			err = -EFAULT;
+			goto out;
+		}
 
-		if (copy_to_user(optval, &val, len))
-			return -EFAULT;
 		break;
 	case IRLMP_IAS_GET:
 		/* The user want an object from our local IAS database.
@@ -2332,17 +2338,22 @@ bed:
 		 * that we found */
 
 		/* Check that the user has allocated the right space for us */
-		if (len != sizeof(struct irda_ias_set))
-			return -EINVAL;
+		if (len != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
 
 		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
-		if (ias_opt == NULL)
-			return -ENOMEM;
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		/* Copy query to the driver. */
 		if (copy_from_user(ias_opt, optval, len)) {
 			kfree(ias_opt);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 
 		/* Find the object we target.
@@ -2355,7 +2366,8 @@ bed:
 			ias_obj = irias_find_object(ias_opt->irda_class_name);
 		if(ias_obj == (struct ias_object *) NULL) {
 			kfree(ias_opt);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 
 		/* Find the attribute (in the object) we target */
@@ -2363,21 +2375,23 @@ bed:
 					     ias_opt->irda_attrib_name);
 		if(ias_attr == (struct ias_attrib *) NULL) {
 			kfree(ias_opt);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 
 		/* Translate from internal to user structure */
 		err = irda_extract_ias_value(ias_opt, ias_attr->value);
 		if(err) {
 			kfree(ias_opt);
-			return err;
+			goto out;
 		}
 
 		/* Copy reply to the user */
 		if (copy_to_user(optval, ias_opt,
 				 sizeof(struct irda_ias_set))) {
 			kfree(ias_opt);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 		/* Note : don't need to put optlen, we checked it */
 		kfree(ias_opt);
@@ -2388,17 +2402,22 @@ bed:
 		 * then wait for the answer to come back. */
 
 		/* Check that the user has allocated the right space for us */
-		if (len != sizeof(struct irda_ias_set))
-			return -EINVAL;
+		if (len != sizeof(struct irda_ias_set)) {
+			err = -EINVAL;
+			goto out;
+		}
 
 		ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC);
-		if (ias_opt == NULL)
-			return -ENOMEM;
+		if (ias_opt == NULL) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		/* Copy query to the driver. */
 		if (copy_from_user(ias_opt, optval, len)) {
 			kfree(ias_opt);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 
 		/* At this point, there are two cases...
@@ -2419,7 +2438,8 @@ bed:
 			daddr = ias_opt->daddr;
 			if((!daddr) || (daddr == DEV_ADDR_ANY)) {
 				kfree(ias_opt);
-				return -EINVAL;
+				err = -EINVAL;
+				goto out;
 			}
 		}
 
@@ -2428,7 +2448,8 @@ bed:
 			IRDA_WARNING("%s: busy with a previous query\n",
 				     __func__);
 			kfree(ias_opt);
-			return -EBUSY;
+			err = -EBUSY;
+			goto out;
 		}
 
 		self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self,
@@ -2436,7 +2457,8 @@ bed:
 
 		if (self->iriap == NULL) {
 			kfree(ias_opt);
-			return -ENOMEM;
+			err = -ENOMEM;
+			goto out;
 		}
 
 		/* Treat unexpected wakeup as disconnect */
@@ -2455,7 +2477,8 @@ bed:
 			 * we can free it regardless! */
 			kfree(ias_opt);
 			/* Treat signals as disconnect */
-			return -EHOSTUNREACH;
+			err = -EHOSTUNREACH;
+			goto out;
 		}
 
 		/* Check what happened */
@@ -2465,9 +2488,11 @@ bed:
 			/* Requested object/attribute doesn't exist */
 			if((self->errno == IAS_CLASS_UNKNOWN) ||
 			   (self->errno == IAS_ATTRIB_UNKNOWN))
-				return (-EADDRNOTAVAIL);
+				err = -EADDRNOTAVAIL;
 			else
-				return (-EHOSTUNREACH);
+				err = -EHOSTUNREACH;
+
+			goto out;
 		}
 
 		/* Translate from internal to user structure */
@@ -2476,14 +2501,15 @@ bed:
 			irias_delete_value(self->ias_result);
 		if (err) {
 			kfree(ias_opt);
-			return err;
+			goto out;
 		}
 
 		/* Copy reply to the user */
 		if (copy_to_user(optval, ias_opt,
 				 sizeof(struct irda_ias_set))) {
 			kfree(ias_opt);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 		/* Note : don't need to put optlen, we checked it */
 		kfree(ias_opt);
@@ -2504,11 +2530,15 @@ bed:
 		 */
 
 		/* Check that the user is passing us an int */
-		if (len != sizeof(int))
-			return -EINVAL;
+		if (len != sizeof(int)) {
+			err = -EINVAL;
+			goto out;
+		}
 		/* Get timeout in ms (max time we block the caller) */
-		if (get_user(val, (int __user *)optval))
-			return -EFAULT;
+		if (get_user(val, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
 
 		/* Tell IrLMP we want to be notified */
 		irlmp_update_client(self->ckey, self->mask.word,
@@ -2520,8 +2550,6 @@ bed:
 
 		/* Wait until a node is discovered */
 		if (!self->cachedaddr) {
-			int ret = 0;
-
 			IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__);
 
 			/* Set watchdog timer to expire in <val> ms. */
@@ -2534,7 +2562,7 @@ bed:
 			/* Wait for IR-LMP to call us back */
 			__wait_event_interruptible(self->query_wait,
 			      (self->cachedaddr != 0 || self->errno == -ETIME),
-						   ret);
+						   err);
 
 			/* If watchdog is still activated, kill it! */
 			if(timer_pending(&(self->watchdog)))
@@ -2542,8 +2570,8 @@ bed:
 
 			IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__);
 
-			if (ret != 0)
-				return ret;
+			if (err != 0)
+				goto out;
 		}
 		else
 			IRDA_DEBUG(1, "%s(), found immediately !\n",
@@ -2566,25 +2594,19 @@ bed:
 		 * If the user want more details, he should query
 		 * the whole discovery log and pick one device...
 		 */
-		if (put_user(daddr, (int __user *)optval))
-			return -EFAULT;
+		if (put_user(daddr, (int __user *)optval)) {
+			err = -EFAULT;
+			goto out;
+		}
 
 		break;
 	default:
-		return -ENOPROTOOPT;
+		err = -ENOPROTOOPT;
 	}
 
-	return 0;
-}
-
-static int irda_getsockopt(struct socket *sock, int level, int optname,
-			   char __user *optval, int __user *optlen)
-{
-	int err;
+out:
 
-	lock_kernel();
-	err = __irda_getsockopt(sock, level, optname, optval, optlen);
-	unlock_kernel();
+	release_sock(sk);
 
 	return err;
 }
@@ -2628,7 +2650,7 @@ static const struct proto_ops irda_seqpacket_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	irda_accept,
 	.getname =	irda_getname,
-	.poll =		irda_datagram_poll,
+	.poll =		datagram_poll,
 	.ioctl =	irda_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	irda_compat_ioctl,
@@ -2652,7 +2674,7 @@ static const struct proto_ops irda_dgram_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	irda_accept,
 	.getname =	irda_getname,
-	.poll =		irda_datagram_poll,
+	.poll =		datagram_poll,
 	.ioctl =	irda_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	irda_compat_ioctl,
@@ -2677,7 +2699,7 @@ static const struct proto_ops irda_ultra_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	irda_getname,
-	.poll =		irda_datagram_poll,
+	.poll =		datagram_poll,
 	.ioctl =	irda_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =	irda_compat_ioctl,
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
index c1c8ae93912..36c3f037f17 100644
--- a/net/irda/discovery.c
+++ b/net/irda/discovery.c
@@ -315,7 +315,7 @@ struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn,
 
 	/* Get the actual number of device in the buffer and return */
 	*pn = i;
-	return(buffer);
+	return buffer;
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index faa82ca2dfd..a39cca8331d 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -449,8 +449,8 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
 		}
 
 #ifdef SERIAL_DO_RESTART
-		return ((self->flags & ASYNC_HUP_NOTIFY) ?
-			-EAGAIN : -ERESTARTSYS);
+		return (self->flags & ASYNC_HUP_NOTIFY) ?
+			-EAGAIN : -ERESTARTSYS;
 #else
 		return -EAGAIN;
 #endif
diff --git a/net/irda/iriap.c b/net/irda/iriap.c
index fce364c6c71..5b743bdd89b 100644
--- a/net/irda/iriap.c
+++ b/net/irda/iriap.c
@@ -502,7 +502,8 @@ static void iriap_getvaluebyclass_confirm(struct iriap_cb *self,
 		IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len);
 
 		/* Make sure the string is null-terminated */
-		fp[n+value_len] = 0x00;
+		if (n + value_len < skb->len)
+			fp[n + value_len] = 0x00;
 		IRDA_DEBUG(4, "Got string %s\n", fp+n);
 
 		/* Will truncate to IAS_MAX_STRING bytes */
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
index a788f9e9427..6130f9d9dbe 100644
--- a/net/irda/irlan/irlan_common.c
+++ b/net/irda/irlan/irlan_common.c
@@ -1102,7 +1102,7 @@ int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len)
 	memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */
 	le16_to_cpus(&val_len); n+=2;
 
-	if (val_len > 1016) {
+	if (val_len >= 1016) {
 		IRDA_DEBUG(2, "%s(), parameter length to long\n", __func__ );
 		return -RSP_INVALID_COMMAND_FORMAT;
 	}
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 9616c32d107..8ee1ff6c742 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -45,13 +45,11 @@ static int  irlan_eth_close(struct net_device *dev);
 static netdev_tx_t  irlan_eth_xmit(struct sk_buff *skb,
 					 struct net_device *dev);
 static void irlan_eth_set_multicast_list( struct net_device *dev);
-static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
 
 static const struct net_device_ops irlan_eth_netdev_ops = {
 	.ndo_open               = irlan_eth_open,
 	.ndo_stop               = irlan_eth_close,
 	.ndo_start_xmit    	= irlan_eth_xmit,
-	.ndo_get_stats	        = irlan_eth_get_stats,
 	.ndo_set_multicast_list = irlan_eth_set_multicast_list,
 	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -169,6 +167,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
 {
 	struct irlan_cb *self = netdev_priv(dev);
 	int ret;
+	unsigned int len;
 
 	/* skb headroom large enough to contain all IrDA-headers? */
 	if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) {
@@ -188,6 +187,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
 
 	dev->trans_start = jiffies;
 
+	len = skb->len;
 	/* Now queue the packet in the transport layer */
 	if (self->use_udata)
 		ret = irttp_udata_request(self->tsap_data, skb);
@@ -206,10 +206,10 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
 		 * tried :-) DB
 		 */
 		/* irttp_data_request already free the packet */
-		self->stats.tx_dropped++;
+		dev->stats.tx_dropped++;
 	} else {
-		self->stats.tx_packets++;
-		self->stats.tx_bytes += skb->len;
+		dev->stats.tx_packets++;
+		dev->stats.tx_bytes += len;
 	}
 
 	return NETDEV_TX_OK;
@@ -224,15 +224,16 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb,
 int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
 {
 	struct irlan_cb *self = instance;
+	struct net_device *dev = self->dev;
 
 	if (skb == NULL) {
-		++self->stats.rx_dropped;
+		dev->stats.rx_dropped++;
 		return 0;
 	}
 	if (skb->len < ETH_HLEN) {
 		IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n",
 			   __func__, skb->len);
-		++self->stats.rx_dropped;
+		dev->stats.rx_dropped++;
 		dev_kfree_skb(skb);
 		return 0;
 	}
@@ -242,10 +243,10 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb)
 	 * might have been previously set by the low level IrDA network
 	 * device driver
 	 */
-	skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */
+	skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */
 
-	self->stats.rx_packets++;
-	self->stats.rx_bytes += skb->len;
+	dev->stats.rx_packets++;
+	dev->stats.rx_bytes += skb->len;
 
 	netif_rx(skb);   /* Eat it! */
 
@@ -346,16 +347,3 @@ static void irlan_eth_set_multicast_list(struct net_device *dev)
 	else
 		irlan_set_broadcast_filter(self, FALSE);
 }
-
-/*
- * Function irlan_get_stats (dev)
- *
- *    Get the current statistics for this device
- *
- */
-static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev)
-{
-	struct irlan_cb *self = netdev_priv(dev);
-
-	return &self->stats;
-}
diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c
index cbcb4eb5403..43f16040a6f 100644
--- a/net/irda/irlan/irlan_event.c
+++ b/net/irda/irlan/irlan_event.c
@@ -24,7 +24,7 @@
 
 #include <net/irda/irlan_event.h>
 
-char *irlan_state[] = {
+const char * const irlan_state[] = {
 	"IRLAN_IDLE",
 	"IRLAN_QUERY",
 	"IRLAN_CONN",
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 0e7d8bde145..6115a44c0a2 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -939,7 +939,7 @@ struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots)
 	}
 
 	/* Return current cached discovery log */
-	return(irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE));
+	return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE);
 }
 EXPORT_SYMBOL(irlmp_get_discoveries);
 
diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c
index 3750884094d..062e63b1c5c 100644
--- a/net/irda/irlmp_frame.c
+++ b/net/irda/irlmp_frame.c
@@ -448,7 +448,7 @@ static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel,
 	    (self->cache.slsap_sel == slsap_sel) &&
 	    (self->cache.dlsap_sel == dlsap_sel))
 	{
-		return (self->cache.lsap);
+		return self->cache.lsap;
 	}
 #endif
 
diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h
index 4300df35d37..0d82ff5aeff 100644
--- a/net/irda/irnet/irnet.h
+++ b/net/irda/irnet/irnet.h
@@ -458,6 +458,8 @@ typedef struct irnet_socket
   int			disco_index;	/* Last read in the discovery log */
   int			disco_number;	/* Size of the discovery log */
 
+  struct mutex		lock;
+
 } irnet_socket;
 
 /*
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
index e98e40d76f4..7f17a8020e8 100644
--- a/net/irda/irnet/irnet_irda.c
+++ b/net/irda/irnet/irnet_irda.c
@@ -238,7 +238,7 @@ irnet_ias_to_tsap(irnet_socket *	self,
   DEXIT(IRDA_SR_TRACE, "\n");
 
   /* Return the TSAP */
-  return(dtsap_sel);
+  return dtsap_sel;
 }
 
 /*------------------------------------------------------------------*/
@@ -301,7 +301,7 @@ irnet_connect_tsap(irnet_socket *	self)
     {
       clear_bit(0, &self->ttp_connect);
       DERROR(IRDA_SR_ERROR, "connect aborted!\n");
-      return(err);
+      return err;
     }
 
   /* Connect to remote device */
@@ -312,7 +312,7 @@ irnet_connect_tsap(irnet_socket *	self)
     {
       clear_bit(0, &self->ttp_connect);
       DERROR(IRDA_SR_ERROR, "connect aborted!\n");
-      return(err);
+      return err;
     }
 
   /* The above call is non-blocking.
@@ -321,7 +321,7 @@ irnet_connect_tsap(irnet_socket *	self)
    * See you there ;-) */
 
   DEXIT(IRDA_SR_TRACE, "\n");
-  return(err);
+  return err;
 }
 
 /*------------------------------------------------------------------*/
@@ -362,10 +362,10 @@ irnet_discover_next_daddr(irnet_socket *	self)
       /* The above request is non-blocking.
        * After a while, IrDA will call us back in irnet_discovervalue_confirm()
        * We will then call irnet_ias_to_tsap() and come back here again... */
-      return(0);
+      return 0;
     }
   else
-    return(1);
+    return 1;
 }
 
 /*------------------------------------------------------------------*/
@@ -436,7 +436,7 @@ irnet_discover_daddr_and_lsap_sel(irnet_socket *	self)
   /* Follow me in irnet_discovervalue_confirm() */
 
   DEXIT(IRDA_SR_TRACE, "\n");
-  return(0);
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -485,7 +485,7 @@ irnet_dname_to_daddr(irnet_socket *	self)
   /* No luck ! */
   DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname);
   kfree(discoveries);
-  return(-EADDRNOTAVAIL);
+  return -EADDRNOTAVAIL;
 }
 
 
@@ -527,7 +527,7 @@ irda_irnet_create(irnet_socket *	self)
   INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect);
 
   DEXIT(IRDA_SOCK_TRACE, "\n");
-  return(0);
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -601,7 +601,7 @@ irda_irnet_connect(irnet_socket *	self)
    * We will finish the connection procedure in irnet_connect_tsap().
    */
   DEXIT(IRDA_SOCK_TRACE, "\n");
-  return(0);
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -733,7 +733,7 @@ irnet_daddr_to_dname(irnet_socket *	self)
   /* No luck ! */
   DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr);
   kfree(discoveries);
-  return(-EADDRNOTAVAIL);
+  return -EADDRNOTAVAIL;
 }
 
 /*------------------------------------------------------------------*/
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 6a1a202710c..7fa86373de4 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -20,7 +20,7 @@
 /* Please put other headers in irnet.h - Thanks */
 
 /* Generic PPP callbacks (to call us) */
-static struct ppp_channel_ops irnet_ppp_ops = {
+static const struct ppp_channel_ops irnet_ppp_ops = {
 	.start_xmit = ppp_irnet_send,
 	.ioctl = ppp_irnet_ioctl
 };
@@ -166,7 +166,7 @@ irnet_ctrl_write(irnet_socket *	ap,
     }
 
   /* Success : we have parsed all commands successfully */
-  return(count);
+  return count;
 }
 
 #ifdef INITIAL_DISCOVERY
@@ -300,7 +300,7 @@ irnet_ctrl_read(irnet_socket *	ap,
 	}
 
       DEXIT(CTRL_TRACE, "\n");
-      return(strlen(event));
+      return strlen(event);
     }
 #endif /* INITIAL_DISCOVERY */
 
@@ -409,7 +409,7 @@ irnet_ctrl_read(irnet_socket *	ap,
     }
 
   DEXIT(CTRL_TRACE, "\n");
-  return(strlen(event));
+  return strlen(event);
 }
 
 /*------------------------------------------------------------------*/
@@ -480,7 +480,6 @@ dev_irnet_open(struct inode *	inode,
   ap = kzalloc(sizeof(*ap), GFP_KERNEL);
   DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n");
 
-  lock_kernel();
   /* initialize the irnet structure */
   ap->file = file;
 
@@ -502,18 +501,20 @@ dev_irnet_open(struct inode *	inode,
     {
       DERROR(FS_ERROR, "Can't setup IrDA link...\n");
       kfree(ap);
-      unlock_kernel();
+
       return err;
     }
 
   /* For the control channel */
   ap->event_index = irnet_events.index;	/* Cancel all past events */
 
+  mutex_init(&ap->lock);
+
   /* Put our stuff where we will be able to find it later */
   file->private_data = ap;
 
   DEXIT(FS_TRACE, " - ap=0x%p\n", ap);
-  unlock_kernel();
+
   return 0;
 }
 
@@ -527,7 +528,7 @@ static int
 dev_irnet_close(struct inode *	inode,
 		struct file *	file)
 {
-  irnet_socket *	ap = (struct irnet_socket *) file->private_data;
+  irnet_socket *	ap = file->private_data;
 
   DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
 	 file, ap);
@@ -564,7 +565,7 @@ dev_irnet_write(struct file *	file,
 		size_t		count,
 		loff_t *	ppos)
 {
-  irnet_socket *	ap = (struct irnet_socket *) file->private_data;
+  irnet_socket *	ap = file->private_data;
 
   DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
 	file, ap, count);
@@ -588,7 +589,7 @@ dev_irnet_read(struct file *	file,
 	       size_t		count,
 	       loff_t *		ppos)
 {
-  irnet_socket *	ap = (struct irnet_socket *) file->private_data;
+  irnet_socket *	ap = file->private_data;
 
   DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n",
 	file, ap, count);
@@ -609,7 +610,7 @@ static unsigned int
 dev_irnet_poll(struct file *	file,
 	       poll_table *	wait)
 {
-  irnet_socket *	ap = (struct irnet_socket *) file->private_data;
+  irnet_socket *	ap = file->private_data;
   unsigned int		mask;
 
   DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
@@ -623,7 +624,7 @@ dev_irnet_poll(struct file *	file,
     mask |= irnet_ctrl_poll(ap, file, wait);
 
   DEXIT(FS_TRACE, " - mask=0x%X\n", mask);
-  return(mask);
+  return mask;
 }
 
 /*------------------------------------------------------------------*/
@@ -638,7 +639,7 @@ dev_irnet_ioctl(
 		unsigned int	cmd,
 		unsigned long	arg)
 {
-  irnet_socket *	ap = (struct irnet_socket *) file->private_data;
+  irnet_socket *	ap = file->private_data;
   int			err;
   int			val;
   void __user *argp = (void __user *)arg;
@@ -663,8 +664,10 @@ dev_irnet_ioctl(
       if((val == N_SYNC_PPP) || (val == N_PPP))
 	{
 	  DEBUG(FS_INFO, "Entering PPP discipline.\n");
-	  /* PPP channel setup (ap->chan in configued in dev_irnet_open())*/
-	  lock_kernel();
+	  /* PPP channel setup (ap->chan in configured in dev_irnet_open())*/
+	  if (mutex_lock_interruptible(&ap->lock))
+		  return -EINTR;
+
 	  err = ppp_register_channel(&ap->chan);
 	  if(err == 0)
 	    {
@@ -677,14 +680,17 @@ dev_irnet_ioctl(
 	    }
 	  else
 	    DERROR(FS_ERROR, "Can't setup PPP channel...\n");
-          unlock_kernel();
+
+          mutex_unlock(&ap->lock);
 	}
       else
 	{
 	  /* In theory, should be N_TTY */
 	  DEBUG(FS_INFO, "Exiting PPP discipline.\n");
 	  /* Disconnect from the generic PPP layer */
-	  lock_kernel();
+	  if (mutex_lock_interruptible(&ap->lock))
+		  return -EINTR;
+
 	  if(ap->ppp_open)
 	    {
 	      ap->ppp_open = 0;
@@ -693,24 +699,31 @@ dev_irnet_ioctl(
 	  else
 	    DERROR(FS_ERROR, "Channel not registered !\n");
 	  err = 0;
-	  unlock_kernel();
+
+	  mutex_unlock(&ap->lock);
 	}
       break;
 
       /* Query PPP channel and unit number */
     case PPPIOCGCHAN:
-      lock_kernel();
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
       if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan),
 						(int __user *)argp))
 	err = 0;
-      unlock_kernel();
+
+      mutex_unlock(&ap->lock);
       break;
     case PPPIOCGUNIT:
-      lock_kernel();
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
       if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan),
 						(int __user *)argp))
         err = 0;
-      unlock_kernel();
+
+      mutex_unlock(&ap->lock);
       break;
 
       /* All these ioctls can be passed both directly and from ppp_generic,
@@ -730,9 +743,12 @@ dev_irnet_ioctl(
       if(!capable(CAP_NET_ADMIN))
 	err = -EPERM;
       else {
-	lock_kernel();
+	if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
 	err = ppp_irnet_ioctl(&ap->chan, cmd, arg);
-	unlock_kernel();
+
+	mutex_unlock(&ap->lock);
       }
       break;
 
@@ -740,7 +756,9 @@ dev_irnet_ioctl(
       /* Get termios */
     case TCGETS:
       DEBUG(FS_INFO, "Get termios.\n");
-      lock_kernel();
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
 #ifndef TCGETS2
       if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios))
 	err = 0;
@@ -748,12 +766,15 @@ dev_irnet_ioctl(
       if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios))
 	err = 0;
 #endif
-      unlock_kernel();
+
+      mutex_unlock(&ap->lock);
       break;
       /* Set termios */
     case TCSETSF:
       DEBUG(FS_INFO, "Set termios.\n");
-      lock_kernel();
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
+
 #ifndef TCGETS2
       if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp))
 	err = 0;
@@ -761,7 +782,8 @@ dev_irnet_ioctl(
       if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp))
 	err = 0;
 #endif
-      unlock_kernel();
+
+      mutex_unlock(&ap->lock);
       break;
 
       /* Set DTR/RTS */
@@ -784,9 +806,10 @@ dev_irnet_ioctl(
        * We should also worry that we don't accept junk here and that
        * we get rid of our own buffers */
 #ifdef FLUSH_TO_PPP
-      lock_kernel();
+      if (mutex_lock_interruptible(&ap->lock))
+	      return -EINTR;
       ppp_output_wakeup(&ap->chan);
-      unlock_kernel();
+      mutex_unlock(&ap->lock);
 #endif /* FLUSH_TO_PPP */
       err = 0;
       break;
diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h
index b5df2418f90..940225866da 100644
--- a/net/irda/irnet/irnet_ppp.h
+++ b/net/irda/irnet/irnet_ppp.h
@@ -103,7 +103,8 @@ static const struct file_operations irnet_device_fops =
 	.poll		= dev_irnet_poll,
 	.unlocked_ioctl	= dev_irnet_ioctl,
 	.open		= dev_irnet_open,
-	.release	= dev_irnet_close
+	.release	= dev_irnet_close,
+	.llseek		= noop_llseek,
   /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */
 };
 
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index 47db1d8a0d9..285761e77d9 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1853,23 +1853,23 @@ static int irttp_seq_show(struct seq_file *seq, void *v)
 		   self->remote_credit);
 	seq_printf(seq, "send credit: %d\n",
 		   self->send_credit);
-	seq_printf(seq, "  tx packets: %ld, ",
+	seq_printf(seq, "  tx packets: %lu, ",
 		   self->stats.tx_packets);
-	seq_printf(seq, "rx packets: %ld, ",
+	seq_printf(seq, "rx packets: %lu, ",
 		   self->stats.rx_packets);
-	seq_printf(seq, "tx_queue len: %d ",
+	seq_printf(seq, "tx_queue len: %u ",
 		   skb_queue_len(&self->tx_queue));
-	seq_printf(seq, "rx_queue len: %d\n",
+	seq_printf(seq, "rx_queue len: %u\n",
 		   skb_queue_len(&self->rx_queue));
 	seq_printf(seq, "  tx_sdu_busy: %s, ",
 		   self->tx_sdu_busy? "TRUE":"FALSE");
 	seq_printf(seq, "rx_sdu_busy: %s\n",
 		   self->rx_sdu_busy? "TRUE":"FALSE");
-	seq_printf(seq, "  max_seg_size: %d, ",
+	seq_printf(seq, "  max_seg_size: %u, ",
 		   self->max_seg_size);
-	seq_printf(seq, "tx_max_sdu_size: %d, ",
+	seq_printf(seq, "tx_max_sdu_size: %u, ",
 		   self->tx_max_sdu_size);
-	seq_printf(seq, "rx_max_sdu_size: %d\n",
+	seq_printf(seq, "rx_max_sdu_size: %u\n",
 		   self->rx_max_sdu_size);
 
 	seq_printf(seq, "  Used by (%s)\n\n",
diff --git a/net/irda/parameters.c b/net/irda/parameters.c
index fc1a20565e2..71cd38c1a67 100644
--- a/net/irda/parameters.c
+++ b/net/irda/parameters.c
@@ -298,6 +298,8 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
 
 	p.pi = pi;     /* In case handler needs to know */
 	p.pl = buf[1]; /* Extract length of value */
+	if (p.pl > 32)
+		p.pl = 32;
 
 	IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__,
 		   p.pi, p.pl);
@@ -318,7 +320,7 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi,
 		   (__u8) str[0], (__u8) str[1]);
 
 	/* Null terminate string */
-	str[p.pl+1] = '\0';
+	str[p.pl] = '\0';
 
 	p.pv.c = str; /* Handler will need to take a copy */
 
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index f28ad2cc842..f7db676de77 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1463,7 +1463,7 @@ struct iucv_path_pending {
 	u32 res3;
 	u8  ippollfg;
 	u8  res4[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_pending(struct iucv_irq_data *data)
 {
@@ -1524,7 +1524,7 @@ struct iucv_path_complete {
 	u32 res3;
 	u8  ippollfg;
 	u8  res4[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_complete(struct iucv_irq_data *data)
 {
@@ -1554,7 +1554,7 @@ struct iucv_path_severed {
 	u32 res4;
 	u8  ippollfg;
 	u8  res5[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_severed(struct iucv_irq_data *data)
 {
@@ -1590,7 +1590,7 @@ struct iucv_path_quiesced {
 	u32 res4;
 	u8  ippollfg;
 	u8  res5[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_quiesced(struct iucv_irq_data *data)
 {
@@ -1618,7 +1618,7 @@ struct iucv_path_resumed {
 	u32 res4;
 	u8  ippollfg;
 	u8  res5[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_path_resumed(struct iucv_irq_data *data)
 {
@@ -1649,7 +1649,7 @@ struct iucv_message_complete {
 	u32 ipbfln2f;
 	u8  ippollfg;
 	u8  res2[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_message_complete(struct iucv_irq_data *data)
 {
@@ -1694,7 +1694,7 @@ struct iucv_message_pending {
 	u32 ipbfln2f;
 	u8  ippollfg;
 	u8  res2[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void iucv_message_pending(struct iucv_irq_data *data)
 {
@@ -1798,7 +1798,8 @@ static void iucv_work_fn(struct work_struct *work)
  * Handles external interrupts coming in from CP.
  * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn().
  */
-static void iucv_external_interrupt(u16 code)
+static void iucv_external_interrupt(unsigned int ext_int_code,
+				    unsigned int param32, unsigned long param64)
 {
 	struct iucv_irq_data *p;
 	struct iucv_irq_list *work;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 43040e97c47..d87c22df6f1 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -565,12 +565,12 @@ pfkey_proto2satype(uint16_t proto)
 
 static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
 {
-	return (proto == IPSEC_PROTO_ANY ? 0 : proto);
+	return proto == IPSEC_PROTO_ANY ? 0 : proto;
 }
 
 static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
 {
-	return (proto ? proto : IPSEC_PROTO_ANY);
+	return proto ? proto : IPSEC_PROTO_ANY;
 }
 
 static inline int pfkey_sockaddr_len(sa_family_t family)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 1712af1c7b3..c64ce0a0bb0 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -111,6 +111,10 @@ struct l2tp_net {
 	spinlock_t l2tp_session_hlist_lock;
 };
 
+static void l2tp_session_set_header_len(struct l2tp_session *session, int version);
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
+static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
+
 static inline struct l2tp_net *l2tp_pernet(struct net *net)
 {
 	BUG_ON(!net);
@@ -118,6 +122,34 @@ static inline struct l2tp_net *l2tp_pernet(struct net *net)
 	return net_generic(net, l2tp_net_id);
 }
 
+
+/* Tunnel reference counts. Incremented per session that is added to
+ * the tunnel.
+ */
+static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel)
+{
+	atomic_inc(&tunnel->ref_count);
+}
+
+static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel)
+{
+	if (atomic_dec_and_test(&tunnel->ref_count))
+		l2tp_tunnel_free(tunnel);
+}
+#ifdef L2TP_REFCNT_DEBUG
+#define l2tp_tunnel_inc_refcount(_t) do { \
+		printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
+		l2tp_tunnel_inc_refcount_1(_t);				\
+	} while (0)
+#define l2tp_tunnel_dec_refcount(_t) do { \
+		printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
+		l2tp_tunnel_dec_refcount_1(_t);				\
+	} while (0)
+#else
+#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t)
+#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t)
+#endif
+
 /* Session hash global list for L2TPv3.
  * The session_id SHOULD be random according to RFC3931, but several
  * L2TP implementations use incrementing session_ids.  So we do a real
@@ -699,8 +731,8 @@ EXPORT_SYMBOL(l2tp_recv_common);
  * Returns 1 if the packet was not a good data packet and could not be
  * forwarded.  All such packets are passed up to userspace to deal with.
  */
-int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
-		       int (*payload_hook)(struct sk_buff *skb))
+static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
+			      int (*payload_hook)(struct sk_buff *skb))
 {
 	struct l2tp_session *session = NULL;
 	unsigned char *ptr, *optr;
@@ -812,7 +844,6 @@ error:
 
 	return 1;
 }
-EXPORT_SYMBOL_GPL(l2tp_udp_recv_core);
 
 /* UDP encapsulation receive handler. See net/ipv4/udp.c.
  * Return codes:
@@ -922,7 +953,8 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
 	return bufp - optr;
 }
 
-int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len)
+static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
+			  size_t data_len)
 {
 	struct l2tp_tunnel *tunnel = session->tunnel;
 	unsigned int len = skb->len;
@@ -970,7 +1002,6 @@ int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t dat
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(l2tp_xmit_core);
 
 /* Automatically called when the skb is freed.
  */
@@ -1089,7 +1120,7 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
  * The tunnel context is deleted only when all session sockets have been
  * closed.
  */
-void l2tp_tunnel_destruct(struct sock *sk)
+static void l2tp_tunnel_destruct(struct sock *sk)
 {
 	struct l2tp_tunnel *tunnel;
 
@@ -1128,11 +1159,10 @@ void l2tp_tunnel_destruct(struct sock *sk)
 end:
 	return;
 }
-EXPORT_SYMBOL(l2tp_tunnel_destruct);
 
 /* When the tunnel is closed, all the attached sessions need to go too.
  */
-void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
+static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
 {
 	int hash;
 	struct hlist_node *walk;
@@ -1193,12 +1223,11 @@ again:
 	}
 	write_unlock_bh(&tunnel->hlist_lock);
 }
-EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
 
 /* Really kill the tunnel.
  * Come here only when all sessions have been cleared from the tunnel.
  */
-void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
 {
 	struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
 
@@ -1217,7 +1246,6 @@ void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
 	atomic_dec(&l2tp_tunnel_count);
 	kfree(tunnel);
 }
-EXPORT_SYMBOL_GPL(l2tp_tunnel_free);
 
 /* Create a socket for the tunnel, if one isn't set up by
  * userspace. This is used for static tunnels where there is no
@@ -1512,7 +1540,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_delete);
 /* We come here whenever a session's send_seq, cookie_len or
  * l2specific_len parameters are set.
  */
-void l2tp_session_set_header_len(struct l2tp_session *session, int version)
+static void l2tp_session_set_header_len(struct l2tp_session *session, int version)
 {
 	if (version == L2TP_HDR_VER_2) {
 		session->hdr_len = 6;
@@ -1525,7 +1553,6 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version)
 	}
 
 }
-EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
 
 struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
 {
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index f0f318edd3f..a16a48e79fa 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -231,48 +231,15 @@ extern int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_i
 extern int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
 extern struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg);
 extern int l2tp_session_delete(struct l2tp_session *session);
-extern void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
 extern void l2tp_session_free(struct l2tp_session *session);
 extern void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, unsigned char *ptr, unsigned char *optr, u16 hdrflags, int length, int (*payload_hook)(struct sk_buff *skb));
-extern int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb, int (*payload_hook)(struct sk_buff *skb));
 extern int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
 
-extern int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, size_t data_len);
 extern int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len);
-extern void l2tp_tunnel_destruct(struct sock *sk);
-extern void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel);
-extern void l2tp_session_set_header_len(struct l2tp_session *session, int version);
 
 extern int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops);
 extern void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
 
-/* Tunnel reference counts. Incremented per session that is added to
- * the tunnel.
- */
-static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel)
-{
-	atomic_inc(&tunnel->ref_count);
-}
-
-static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel)
-{
-	if (atomic_dec_and_test(&tunnel->ref_count))
-		l2tp_tunnel_free(tunnel);
-}
-#ifdef L2TP_REFCNT_DEBUG
-#define l2tp_tunnel_inc_refcount(_t) do { \
-		printk(KERN_DEBUG "l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
-		l2tp_tunnel_inc_refcount_1(_t);				\
-	} while (0)
-#define l2tp_tunnel_dec_refcount(_t) do { \
-		printk(KERN_DEBUG "l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", __func__, __LINE__, (_t)->name, atomic_read(&_t->ref_count)); \
-		l2tp_tunnel_dec_refcount_1(_t);				\
-	} while (0)
-#else
-#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t)
-#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t)
-#endif
-
 /* Session reference counts. Incremented when code obtains a reference
  * to a session.
  */
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 58c6c4cda73..8d9ce0accc9 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
 		printk("\n");
 	}
 
-	if (data_len < ETH_HLEN)
+	if (!pskb_may_pull(skb, sizeof(ETH_HLEN)))
 		goto error;
 
 	secpath_reset(skb);
@@ -144,7 +144,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
 	nf_reset(skb);
 
 	if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
-		dev->last_rx = jiffies;
 		dev->stats.rx_packets++;
 		dev->stats.rx_bytes += data_len;
 	} else
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 0852512d392..0bf6a59545a 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -65,9 +65,7 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif
 			continue;
 
 		if ((l2tp->conn_id == tunnel_id) &&
-#ifdef CONFIG_NET_NS
-		    (sk->sk_net == net) &&
-#endif
+		    net_eq(sock_net(sk), net) &&
 		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
 		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
 			goto found;
@@ -348,7 +346,7 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
 	sk->sk_state = TCP_ESTABLISHED;
 	inet->inet_id = jiffies;
 
-	sk_dst_set(sk, &rt->u.dst);
+	sk_dst_set(sk, &rt->dst);
 
 	write_lock_bh(&l2tp_ip_lock);
 	hlist_del_init(&sk->sk_bind_node);
@@ -496,9 +494,9 @@ static int l2tp_ip_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
 				goto no_route;
 		}
-		sk_setup_caps(sk, &rt->u.dst);
+		sk_setup_caps(sk, &rt->dst);
 	}
-	skb_dst_set(skb, dst_clone(&rt->u.dst));
+	skb_dst_set(skb, dst_clone(&rt->dst));
 
 	/* Queue the packet to IP for output */
 	rc = ip_queue_xmit(skb);
@@ -578,7 +576,7 @@ out:
 	return copied;
 }
 
-struct proto l2tp_ip_prot = {
+static struct proto l2tp_ip_prot = {
 	.name		   = "L2TP/IP",
 	.owner		   = THIS_MODULE,
 	.init		   = l2tp_ip_open,
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 90d82b3f288..39a21d0c61c 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -135,7 +135,10 @@ struct pppol2tp_session {
 
 static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
 
-static struct ppp_channel_ops pppol2tp_chan_ops = { pppol2tp_xmit , NULL };
+static const struct ppp_channel_ops pppol2tp_chan_ops = {
+	.start_xmit =  pppol2tp_xmit,
+};
+
 static const struct proto_ops pppol2tp_ops;
 
 /* Helpers to obtain tunnel/session contexts from sockets.
@@ -1765,7 +1768,7 @@ static const struct proto_ops pppol2tp_ops = {
 	.ioctl		= pppox_ioctl,
 };
 
-static struct pppox_proto pppol2tp_proto = {
+static const struct pppox_proto pppol2tp_proto = {
 	.create		= pppol2tp_create,
 	.ioctl		= pppol2tp_ioctl
 };
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 023ba820236..58261299821 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1024,7 +1024,8 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct llc_sock *llc = llc_sk(sk);
-	int rc = -EINVAL, opt;
+	unsigned int opt;
+	int rc = -EINVAL;
 
 	lock_sock(sk);
 	if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index e4dae0244d7..cf4aea3ba30 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -689,7 +689,7 @@ static void llc_station_rcv(struct sk_buff *skb)
 
 int __init llc_station_init(void)
 {
-	u16 rc = -ENOBUFS;
+	int rc = -ENOBUFS;
 	struct sk_buff *skb;
 	struct llc_station_state_ev *ev;
 
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 8a91f6c0bb1..4d6f8653ec8 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -33,6 +33,13 @@ config MAC80211_RC_MINSTREL
 	---help---
 	  This option enables the 'minstrel' TX rate control algorithm
 
+config MAC80211_RC_MINSTREL_HT
+	bool "Minstrel 802.11n support" if EMBEDDED
+	depends on MAC80211_RC_MINSTREL
+	default y
+	---help---
+	  This option enables the 'minstrel_ht' TX rate control algorithm
+
 choice
 	prompt "Default rate control algorithm"
 	depends on MAC80211_HAS_RC
@@ -62,6 +69,7 @@ endchoice
 
 config MAC80211_RC_DEFAULT
 	string
+	default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT
 	default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
 	default "pid" if MAC80211_RC_DEFAULT_PID
 	default ""
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 84b48ba8a77..fdb54e61d63 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -51,7 +51,11 @@ rc80211_pid-$(CONFIG_MAC80211_DEBUGFS) += rc80211_pid_debugfs.o
 rc80211_minstrel-y := rc80211_minstrel.o
 rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o
 
+rc80211_minstrel_ht-y := rc80211_minstrel_ht.o
+rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
+
 mac80211-$(CONFIG_MAC80211_RC_PID) += $(rc80211_pid-y)
 mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
+mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
 
 ccflags-y += -D__CHECK_ENDIAN__
diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index a87cb3ba2df..d2b03e0851e 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -138,10 +138,8 @@ struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[])
 	struct crypto_cipher *tfm;
 
 	tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(tfm))
-		return NULL;
-
-	crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
+	if (!IS_ERR(tfm))
+		crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN);
 
 	return tfm;
 }
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index 3d097b3d7b6..b4d66cca76d 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -119,10 +119,8 @@ struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[])
 	struct crypto_cipher *tfm;
 
 	tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(tfm))
-		return NULL;
-
-	crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
+	if (!IS_ERR(tfm))
+		crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN);
 
 	return tfm;
 }
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 6bb9a9a9496..720b7a84af5 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -6,39 +6,70 @@
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
- * Copyright 2007-2008, Intel Corporation
+ * Copyright 2007-2010, Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
 
+/**
+ * DOC: RX A-MPDU aggregation
+ *
+ * Aggregation on the RX side requires only implementing the
+ * @ampdu_action callback that is invoked to start/stop any
+ * block-ack sessions for RX aggregation.
+ *
+ * When RX aggregation is started by the peer, the driver is
+ * notified via @ampdu_action function, with the
+ * %IEEE80211_AMPDU_RX_START action, and may reject the request
+ * in which case a negative response is sent to the peer, if it
+ * accepts it a positive response is sent.
+ *
+ * While the session is active, the device/driver are required
+ * to de-aggregate frames and pass them up one by one to mac80211,
+ * which will handle the reorder buffer.
+ *
+ * When the aggregation session is stopped again by the peer or
+ * ourselves, the driver's @ampdu_action function will be called
+ * with the action %IEEE80211_AMPDU_RX_STOP. In this case, the
+ * call must not fail.
+ */
+
 #include <linux/ieee80211.h>
 #include <linux/slab.h>
 #include <net/mac80211.h>
 #include "ieee80211_i.h"
 #include "driver-ops.h"
 
-static void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
-					    u16 initiator, u16 reason,
-					    bool from_timer)
+static void ieee80211_free_tid_rx(struct rcu_head *h)
 {
-	struct ieee80211_local *local = sta->local;
-	struct tid_ampdu_rx *tid_rx;
+	struct tid_ampdu_rx *tid_rx =
+		container_of(h, struct tid_ampdu_rx, rcu_head);
 	int i;
 
-	spin_lock_bh(&sta->lock);
+	for (i = 0; i < tid_rx->buf_size; i++)
+		dev_kfree_skb(tid_rx->reorder_buf[i]);
+	kfree(tid_rx->reorder_buf);
+	kfree(tid_rx->reorder_time);
+	kfree(tid_rx);
+}
 
-	/* check if TID is in operational state */
-	if (!sta->ampdu_mlme.tid_active_rx[tid]) {
-		spin_unlock_bh(&sta->lock);
-		return;
-	}
+void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+				     u16 initiator, u16 reason, bool tx)
+{
+	struct ieee80211_local *local = sta->local;
+	struct tid_ampdu_rx *tid_rx;
 
-	sta->ampdu_mlme.tid_active_rx[tid] = false;
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);
 
 	tid_rx = sta->ampdu_mlme.tid_rx[tid];
 
+	if (!tid_rx)
+		return;
+
+	rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], NULL);
+
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "Rx BA session stop requested for %pM tid %u\n",
 	       sta->sta.addr, tid);
@@ -50,36 +81,22 @@ static void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
 				"aggregation for tid %d\n", tid);
 
 	/* check if this is a self generated aggregation halt */
-	if (initiator == WLAN_BACK_RECIPIENT)
+	if (initiator == WLAN_BACK_RECIPIENT && tx)
 		ieee80211_send_delba(sta->sdata, sta->sta.addr,
 				     tid, 0, reason);
 
-	/* free the reordering buffer */
-	for (i = 0; i < tid_rx->buf_size; i++) {
-		if (tid_rx->reorder_buf[i]) {
-			/* release the reordered frames */
-			dev_kfree_skb(tid_rx->reorder_buf[i]);
-			tid_rx->stored_mpdu_num--;
-			tid_rx->reorder_buf[i] = NULL;
-		}
-	}
-
-	/* free resources */
-	kfree(tid_rx->reorder_buf);
-	kfree(tid_rx->reorder_time);
-	sta->ampdu_mlme.tid_rx[tid] = NULL;
-
-	spin_unlock_bh(&sta->lock);
+	del_timer_sync(&tid_rx->session_timer);
+	del_timer_sync(&tid_rx->reorder_timer);
 
-	if (!from_timer)
-		del_timer_sync(&tid_rx->session_timer);
-	kfree(tid_rx);
+	call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx);
 }
 
 void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
-				    u16 initiator, u16 reason)
+				    u16 initiator, u16 reason, bool tx)
 {
-	___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, false);
+	mutex_lock(&sta->ampdu_mlme.mtx);
+	___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
 }
 
 /*
@@ -100,8 +117,22 @@ static void sta_rx_agg_session_timer_expired(unsigned long data)
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "rx session timer expired on tid %d\n", (u16)*ptid);
 #endif
-	___ieee80211_stop_rx_ba_session(sta, *ptid, WLAN_BACK_RECIPIENT,
-					WLAN_REASON_QSTA_TIMEOUT, true);
+	set_bit(*ptid, sta->ampdu_mlme.tid_rx_timer_expired);
+	ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
+}
+
+static void sta_rx_agg_reorder_timer_expired(unsigned long data)
+{
+	u8 *ptid = (u8 *)data;
+	u8 *timer_to_id = ptid - *ptid;
+	struct sta_info *sta = container_of(timer_to_id, struct sta_info,
+			timer_to_tid[0]);
+
+	rcu_read_lock();
+	spin_lock(&sta->lock);
+	ieee80211_release_reorder_timeout(sta, *ptid);
+	spin_unlock(&sta->lock);
+	rcu_read_unlock();
 }
 
 static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
@@ -212,9 +243,9 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 
 
 	/* examine state machine */
-	spin_lock_bh(&sta->lock);
+	mutex_lock(&sta->ampdu_mlme.mtx);
 
-	if (sta->ampdu_mlme.tid_active_rx[tid]) {
+	if (sta->ampdu_mlme.tid_rx[tid]) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		if (net_ratelimit())
 			printk(KERN_DEBUG "unexpected AddBA Req from "
@@ -225,9 +256,8 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 	}
 
 	/* prepare A-MPDU MLME for Rx aggregation */
-	sta->ampdu_mlme.tid_rx[tid] =
-			kmalloc(sizeof(struct tid_ampdu_rx), GFP_ATOMIC);
-	if (!sta->ampdu_mlme.tid_rx[tid]) {
+	tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_ATOMIC);
+	if (!tid_agg_rx) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		if (net_ratelimit())
 			printk(KERN_ERR "allocate rx mlme to tid %d failed\n",
@@ -235,14 +265,18 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 #endif
 		goto end;
 	}
+
+	spin_lock_init(&tid_agg_rx->reorder_lock);
+
 	/* rx timer */
-	sta->ampdu_mlme.tid_rx[tid]->session_timer.function =
-				sta_rx_agg_session_timer_expired;
-	sta->ampdu_mlme.tid_rx[tid]->session_timer.data =
-				(unsigned long)&sta->timer_to_tid[tid];
-	init_timer(&sta->ampdu_mlme.tid_rx[tid]->session_timer);
+	tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired;
+	tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&tid_agg_rx->session_timer);
 
-	tid_agg_rx = sta->ampdu_mlme.tid_rx[tid];
+	/* rx reorder timer */
+	tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired;
+	tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&tid_agg_rx->reorder_timer);
 
 	/* prepare reordering buffer */
 	tid_agg_rx->reorder_buf =
@@ -257,8 +291,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 #endif
 		kfree(tid_agg_rx->reorder_buf);
 		kfree(tid_agg_rx->reorder_time);
-		kfree(sta->ampdu_mlme.tid_rx[tid]);
-		sta->ampdu_mlme.tid_rx[tid] = NULL;
+		kfree(tid_agg_rx);
 		goto end;
 	}
 
@@ -270,13 +303,12 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 
 	if (ret) {
 		kfree(tid_agg_rx->reorder_buf);
+		kfree(tid_agg_rx->reorder_time);
 		kfree(tid_agg_rx);
-		sta->ampdu_mlme.tid_rx[tid] = NULL;
 		goto end;
 	}
 
-	/* change state and send addba resp */
-	sta->ampdu_mlme.tid_active_rx[tid] = true;
+	/* update data */
 	tid_agg_rx->dialog_token = dialog_token;
 	tid_agg_rx->ssn = start_seq_num;
 	tid_agg_rx->head_seq_num = start_seq_num;
@@ -284,8 +316,15 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 	tid_agg_rx->timeout = timeout;
 	tid_agg_rx->stored_mpdu_num = 0;
 	status = WLAN_STATUS_SUCCESS;
+
+	/* activate it for RX */
+	rcu_assign_pointer(sta->ampdu_mlme.tid_rx[tid], tid_agg_rx);
+
+	if (timeout)
+		mod_timer(&tid_agg_rx->session_timer, TU_TO_EXP_TIME(timeout));
+
 end:
-	spin_unlock_bh(&sta->lock);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
 
 end_no_lock:
 	ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 98258b7341e..d4679b265ba 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -6,7 +6,7 @@
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
- * Copyright 2007-2009, Intel Corporation
+ * Copyright 2007-2010, Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,28 +21,39 @@
 #include "wme.h"
 
 /**
- * DOC: TX aggregation
+ * DOC: TX A-MPDU aggregation
  *
  * Aggregation on the TX side requires setting the hardware flag
- * %IEEE80211_HW_AMPDU_AGGREGATION as well as, if present, the @ampdu_queues
- * hardware parameter to the number of hardware AMPDU queues. If there are no
- * hardware queues then the driver will (currently) have to do all frame
- * buffering.
+ * %IEEE80211_HW_AMPDU_AGGREGATION. The driver will then be handed
+ * packets with a flag indicating A-MPDU aggregation. The driver
+ * or device is responsible for actually aggregating the frames,
+ * as well as deciding how many and which to aggregate.
  *
- * When TX aggregation is started by some subsystem (usually the rate control
- * algorithm would be appropriate) by calling the
- * ieee80211_start_tx_ba_session() function, the driver will be notified via
- * its @ampdu_action function, with the %IEEE80211_AMPDU_TX_START action.
+ * When TX aggregation is started by some subsystem (usually the rate
+ * control algorithm would be appropriate) by calling the
+ * ieee80211_start_tx_ba_session() function, the driver will be
+ * notified via its @ampdu_action function, with the
+ * %IEEE80211_AMPDU_TX_START action.
  *
  * In response to that, the driver is later required to call the
- * ieee80211_start_tx_ba_cb() (or ieee80211_start_tx_ba_cb_irqsafe())
- * function, which will start the aggregation session.
+ * ieee80211_start_tx_ba_cb_irqsafe() function, which will really
+ * start the aggregation session after the peer has also responded.
+ * If the peer responds negatively, the session will be stopped
+ * again right away. Note that it is possible for the aggregation
+ * session to be stopped before the driver has indicated that it
+ * is done setting it up, in which case it must not indicate the
+ * setup completion.
  *
- * Similarly, when the aggregation session is stopped by
- * ieee80211_stop_tx_ba_session(), the driver's @ampdu_action function will
- * be called with the action %IEEE80211_AMPDU_TX_STOP. In this case, the
- * call must not fail, and the driver must later call ieee80211_stop_tx_ba_cb()
- * (or ieee80211_stop_tx_ba_cb_irqsafe()).
+ * Also note that, since we also need to wait for a response from
+ * the peer, the driver is notified of the completion of the
+ * handshake by the %IEEE80211_AMPDU_TX_OPERATIONAL action to the
+ * @ampdu_action callback.
+ *
+ * Similarly, when the aggregation session is stopped by the peer
+ * or something calling ieee80211_stop_tx_ba_session(), the driver's
+ * @ampdu_action function will be called with the action
+ * %IEEE80211_AMPDU_TX_STOP. In this case, the call must not fail,
+ * and the driver must later call ieee80211_stop_tx_ba_cb_irqsafe().
  */
 
 static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
@@ -125,25 +136,57 @@ void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u1
 	ieee80211_tx_skb(sdata, skb);
 }
 
+static void kfree_tid_tx(struct rcu_head *rcu_head)
+{
+	struct tid_ampdu_tx *tid_tx =
+	    container_of(rcu_head, struct tid_ampdu_tx, rcu_head);
+
+	kfree(tid_tx);
+}
+
 int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
-				    enum ieee80211_back_parties initiator)
+				    enum ieee80211_back_parties initiator,
+				    bool tx)
 {
 	struct ieee80211_local *local = sta->local;
+	struct tid_ampdu_tx *tid_tx = sta->ampdu_mlme.tid_tx[tid];
 	int ret;
-	u8 *state;
+
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
+	if (!tid_tx)
+		return -ENOENT;
+
+	spin_lock_bh(&sta->lock);
+
+	if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) {
+		/* not even started yet! */
+		rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], NULL);
+		spin_unlock_bh(&sta->lock);
+		call_rcu(&tid_tx->rcu_head, kfree_tid_tx);
+		return 0;
+	}
+
+	spin_unlock_bh(&sta->lock);
 
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "Tx BA session stop requested for %pM tid %u\n",
 	       sta->sta.addr, tid);
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
 
-	if (*state == HT_AGG_STATE_OPERATIONAL)
-		sta->ampdu_mlme.addba_req_num[tid] = 0;
+	del_timer_sync(&tid_tx->addba_resp_timer);
+
+	/*
+	 * After this packets are no longer handed right through
+	 * to the driver but are put onto tid_tx->pending instead,
+	 * with locking to ensure proper access.
+	 */
+	clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state);
 
-	*state = HT_AGG_STATE_REQ_STOP_BA_MSK |
-		(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
+	tid_tx->stop_initiator = initiator;
+	tid_tx->tx_stop = tx;
 
 	ret = drv_ampdu_action(local, sta->sdata,
 			       IEEE80211_AMPDU_TX_STOP,
@@ -174,16 +217,14 @@ static void sta_addba_resp_timer_expired(unsigned long data)
 	u16 tid = *(u8 *)data;
 	struct sta_info *sta = container_of((void *)data,
 		struct sta_info, timer_to_tid[tid]);
-	u8 *state;
-
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	struct tid_ampdu_tx *tid_tx;
 
 	/* check if the TID waits for addBA response */
-	spin_lock_bh(&sta->lock);
-	if ((*state & (HT_ADDBA_REQUESTED_MSK | HT_ADDBA_RECEIVED_MSK |
-		       HT_AGG_STATE_REQ_STOP_BA_MSK)) !=
-						HT_ADDBA_REQUESTED_MSK) {
-		spin_unlock_bh(&sta->lock);
+	rcu_read_lock();
+	tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+	if (!tid_tx ||
+	    test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) {
+		rcu_read_unlock();
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "timer expired on tid %d but we are not "
 				"(or no longer) expecting addBA response there\n",
@@ -196,8 +237,8 @@ static void sta_addba_resp_timer_expired(unsigned long data)
 	printk(KERN_DEBUG "addBA response timer expired on tid %d\n", tid);
 #endif
 
-	___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR);
-	spin_unlock_bh(&sta->lock);
+	ieee80211_stop_tx_ba_session(&sta->sta, tid);
+	rcu_read_unlock();
 }
 
 static inline int ieee80211_ac_from_tid(int tid)
@@ -205,14 +246,112 @@ static inline int ieee80211_ac_from_tid(int tid)
 	return ieee802_1d_to_ac[tid & 7];
 }
 
+/*
+ * When multiple aggregation sessions on multiple stations
+ * are being created/destroyed simultaneously, we need to
+ * refcount the global queue stop caused by that in order
+ * to not get into a situation where one of the aggregation
+ * setup or teardown re-enables queues before the other is
+ * ready to handle that.
+ *
+ * These two functions take care of this issue by keeping
+ * a global "agg_queue_stop" refcount.
+ */
+static void __acquires(agg_queue)
+ieee80211_stop_queue_agg(struct ieee80211_local *local, int tid)
+{
+	int queue = ieee80211_ac_from_tid(tid);
+
+	if (atomic_inc_return(&local->agg_queue_stop[queue]) == 1)
+		ieee80211_stop_queue_by_reason(
+			&local->hw, queue,
+			IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	__acquire(agg_queue);
+}
+
+static void __releases(agg_queue)
+ieee80211_wake_queue_agg(struct ieee80211_local *local, int tid)
+{
+	int queue = ieee80211_ac_from_tid(tid);
+
+	if (atomic_dec_return(&local->agg_queue_stop[queue]) == 0)
+		ieee80211_wake_queue_by_reason(
+			&local->hw, queue,
+			IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	__release(agg_queue);
+}
+
+void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
+{
+	struct tid_ampdu_tx *tid_tx = sta->ampdu_mlme.tid_tx[tid];
+	struct ieee80211_local *local = sta->local;
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	u16 start_seq_num;
+	int ret;
+
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
+	/*
+	 * While we're asking the driver about the aggregation,
+	 * stop the AC queue so that we don't have to worry
+	 * about frames that came in while we were doing that,
+	 * which would require us to put them to the AC pending
+	 * afterwards which just makes the code more complex.
+	 */
+	ieee80211_stop_queue_agg(local, tid);
+
+	clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
+
+	/*
+	 * make sure no packets are being processed to get
+	 * valid starting sequence number
+	 */
+	synchronize_net();
+
+	start_seq_num = sta->tid_seq[tid] >> 4;
+
+	ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
+			       &sta->sta, tid, &start_seq_num);
+	if (ret) {
+#ifdef CONFIG_MAC80211_HT_DEBUG
+		printk(KERN_DEBUG "BA request denied - HW unavailable for"
+					" tid %d\n", tid);
+#endif
+		spin_lock_bh(&sta->lock);
+		rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], NULL);
+		spin_unlock_bh(&sta->lock);
+
+		ieee80211_wake_queue_agg(local, tid);
+		call_rcu(&tid_tx->rcu_head, kfree_tid_tx);
+		return;
+	}
+
+	/* we can take packets again now */
+	ieee80211_wake_queue_agg(local, tid);
+
+	/* activate the timer for the recipient's addBA response */
+	mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL);
+#ifdef CONFIG_MAC80211_HT_DEBUG
+	printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
+#endif
+
+	spin_lock_bh(&sta->lock);
+	sta->ampdu_mlme.addba_req_num[tid]++;
+	spin_unlock_bh(&sta->lock);
+
+	/* send AddBA request */
+	ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
+				     tid_tx->dialog_token, start_seq_num,
+				     0x40, 5000);
+}
+
 int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
-	u8 *state;
+	struct tid_ampdu_tx *tid_tx;
 	int ret = 0;
-	u16 start_seq_num;
 
 	trace_api_start_tx_ba_session(pubsta, tid);
 
@@ -239,24 +378,15 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
 	    sdata->vif.type != NL80211_IFTYPE_AP)
 		return -EINVAL;
 
-	if (test_sta_flags(sta, WLAN_STA_DISASSOC)) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Disassociation is in progress. "
-		       "Denying BA session request\n");
-#endif
-		return -EINVAL;
-	}
-
 	if (test_sta_flags(sta, WLAN_STA_BLOCK_BA)) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "Suspend in progress. "
+		printk(KERN_DEBUG "BA sessions blocked. "
 		       "Denying BA session request\n");
 #endif
 		return -EINVAL;
 	}
 
 	spin_lock_bh(&sta->lock);
-	spin_lock(&local->ampdu_lock);
 
 	/* we have tried too many times, receiver does not want A-MPDU */
 	if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
@@ -264,9 +394,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
 		goto err_unlock_sta;
 	}
 
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
+	tid_tx = sta->ampdu_mlme.tid_tx[tid];
 	/* check if the TID is not in aggregation flow already */
-	if (*state != HT_AGG_STATE_IDLE) {
+	if (tid_tx) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "BA request denied - session is not "
 				 "idle on tid %u\n", tid);
@@ -275,96 +405,37 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
 		goto err_unlock_sta;
 	}
 
-	/*
-	 * While we're asking the driver about the aggregation,
-	 * stop the AC queue so that we don't have to worry
-	 * about frames that came in while we were doing that,
-	 * which would require us to put them to the AC pending
-	 * afterwards which just makes the code more complex.
-	 */
-	ieee80211_stop_queue_by_reason(
-		&local->hw, ieee80211_ac_from_tid(tid),
-		IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
-
 	/* prepare A-MPDU MLME for Tx aggregation */
-	sta->ampdu_mlme.tid_tx[tid] =
-			kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
-	if (!sta->ampdu_mlme.tid_tx[tid]) {
+	tid_tx = kzalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
+	if (!tid_tx) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		if (net_ratelimit())
 			printk(KERN_ERR "allocate tx mlme to tid %d failed\n",
 					tid);
 #endif
 		ret = -ENOMEM;
-		goto err_wake_queue;
+		goto err_unlock_sta;
 	}
 
-	skb_queue_head_init(&sta->ampdu_mlme.tid_tx[tid]->pending);
+	skb_queue_head_init(&tid_tx->pending);
+	__set_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
 
 	/* Tx timer */
-	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function =
-			sta_addba_resp_timer_expired;
-	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.data =
-			(unsigned long)&sta->timer_to_tid[tid];
-	init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
-
-	/* Ok, the Addba frame hasn't been sent yet, but if the driver calls the
-	 * call back right away, it must see that the flow has begun */
-	*state |= HT_ADDBA_REQUESTED_MSK;
+	tid_tx->addba_resp_timer.function = sta_addba_resp_timer_expired;
+	tid_tx->addba_resp_timer.data = (unsigned long)&sta->timer_to_tid[tid];
+	init_timer(&tid_tx->addba_resp_timer);
 
-	start_seq_num = sta->tid_seq[tid] >> 4;
-
-	ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
-			       pubsta, tid, &start_seq_num);
-
-	if (ret) {
-#ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "BA request denied - HW unavailable for"
-					" tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
-		*state = HT_AGG_STATE_IDLE;
-		goto err_free;
-	}
-
-	/* Driver vetoed or OKed, but we can take packets again now */
-	ieee80211_wake_queue_by_reason(
-		&local->hw, ieee80211_ac_from_tid(tid),
-		IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
-
-	spin_unlock(&local->ampdu_lock);
-
-	/* prepare tid data */
+	/* assign a dialog token */
 	sta->ampdu_mlme.dialog_token_allocator++;
-	sta->ampdu_mlme.tid_tx[tid]->dialog_token =
-			sta->ampdu_mlme.dialog_token_allocator;
-	sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
+	tid_tx->dialog_token = sta->ampdu_mlme.dialog_token_allocator;
 
-	spin_unlock_bh(&sta->lock);
+	/* finally, assign it to the array */
+	rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], tid_tx);
 
-	/* send AddBA request */
-	ieee80211_send_addba_request(sdata, pubsta->addr, tid,
-			 sta->ampdu_mlme.tid_tx[tid]->dialog_token,
-			 sta->ampdu_mlme.tid_tx[tid]->ssn,
-			 0x40, 5000);
-	sta->ampdu_mlme.addba_req_num[tid]++;
-	/* activate the timer for the recipient's addBA response */
-	sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.expires =
-				jiffies + ADDBA_RESP_INTERVAL;
-	add_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
-#ifdef CONFIG_MAC80211_HT_DEBUG
-	printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
-#endif
-	return 0;
-
- err_free:
-	kfree(sta->ampdu_mlme.tid_tx[tid]);
-	sta->ampdu_mlme.tid_tx[tid] = NULL;
- err_wake_queue:
-	ieee80211_wake_queue_by_reason(
-		&local->hw, ieee80211_ac_from_tid(tid),
-		IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+
+	/* this flow continues off the work */
  err_unlock_sta:
-	spin_unlock(&local->ampdu_lock);
 	spin_unlock_bh(&sta->lock);
 	return ret;
 }
@@ -372,69 +443,65 @@ EXPORT_SYMBOL(ieee80211_start_tx_ba_session);
 
 /*
  * splice packets from the STA's pending to the local pending,
- * requires a call to ieee80211_agg_splice_finish and holding
- * local->ampdu_lock across both calls.
+ * requires a call to ieee80211_agg_splice_finish later
  */
-static void ieee80211_agg_splice_packets(struct ieee80211_local *local,
-					 struct sta_info *sta, u16 tid)
+static void __acquires(agg_queue)
+ieee80211_agg_splice_packets(struct ieee80211_local *local,
+			     struct tid_ampdu_tx *tid_tx, u16 tid)
 {
+	int queue = ieee80211_ac_from_tid(tid);
 	unsigned long flags;
-	u16 queue = ieee80211_ac_from_tid(tid);
 
-	ieee80211_stop_queue_by_reason(
-		&local->hw, queue,
-		IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	ieee80211_stop_queue_agg(local, tid);
 
-	if (!(sta->ampdu_mlme.tid_state_tx[tid] & HT_ADDBA_REQUESTED_MSK))
+	if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates"
+			  " from the pending queue\n", tid))
 		return;
 
-	if (WARN(!sta->ampdu_mlme.tid_tx[tid],
-		 "TID %d gone but expected when splicing aggregates from"
-		 "the pending queue\n", tid))
-		return;
-
-	if (!skb_queue_empty(&sta->ampdu_mlme.tid_tx[tid]->pending)) {
+	if (!skb_queue_empty(&tid_tx->pending)) {
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		/* copy over remaining packets */
-		skb_queue_splice_tail_init(
-			&sta->ampdu_mlme.tid_tx[tid]->pending,
-			&local->pending[queue]);
+		skb_queue_splice_tail_init(&tid_tx->pending,
+					   &local->pending[queue]);
 		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
 	}
 }
 
-static void ieee80211_agg_splice_finish(struct ieee80211_local *local,
-					struct sta_info *sta, u16 tid)
+static void __releases(agg_queue)
+ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid)
 {
-	u16 queue = ieee80211_ac_from_tid(tid);
-
-	ieee80211_wake_queue_by_reason(
-		&local->hw, queue,
-		IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
+	ieee80211_wake_queue_agg(local, tid);
 }
 
-/* caller must hold sta->lock */
 static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
 					 struct sta_info *sta, u16 tid)
 {
+	lockdep_assert_held(&sta->ampdu_mlme.mtx);
+
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "Aggregation is on for tid %d\n", tid);
 #endif
 
-	spin_lock(&local->ampdu_lock);
-	ieee80211_agg_splice_packets(local, sta, tid);
-	/*
-	 * NB: we rely on sta->lock being taken in the TX
-	 * processing here when adding to the pending queue,
-	 * otherwise we could only change the state of the
-	 * session to OPERATIONAL _here_.
-	 */
-	ieee80211_agg_splice_finish(local, sta, tid);
-	spin_unlock(&local->ampdu_lock);
-
 	drv_ampdu_action(local, sta->sdata,
 			 IEEE80211_AMPDU_TX_OPERATIONAL,
 			 &sta->sta, tid, NULL);
+
+	/*
+	 * synchronize with TX path, while splicing the TX path
+	 * should block so it won't put more packets onto pending.
+	 */
+	spin_lock_bh(&sta->lock);
+
+	ieee80211_agg_splice_packets(local, sta->ampdu_mlme.tid_tx[tid], tid);
+	/*
+	 * Now mark as operational. This will be visible
+	 * in the TX path, and lets it go lock-free in
+	 * the common case.
+	 */
+	set_bit(HT_AGG_STATE_OPERATIONAL, &sta->ampdu_mlme.tid_tx[tid]->state);
+	ieee80211_agg_splice_finish(local, tid);
+
+	spin_unlock_bh(&sta->lock);
 }
 
 void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid)
@@ -442,7 +509,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid)
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
-	u8 *state;
+	struct tid_ampdu_tx *tid_tx;
 
 	trace_api_start_tx_ba_cb(sdata, ra, tid);
 
@@ -454,42 +521,36 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid)
 		return;
 	}
 
-	rcu_read_lock();
+	mutex_lock(&local->sta_mtx);
 	sta = sta_info_get(sdata, ra);
 	if (!sta) {
-		rcu_read_unlock();
+		mutex_unlock(&local->sta_mtx);
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "Could not find station: %pM\n", ra);
 #endif
 		return;
 	}
 
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-	spin_lock_bh(&sta->lock);
+	mutex_lock(&sta->ampdu_mlme.mtx);
+	tid_tx = sta->ampdu_mlme.tid_tx[tid];
 
-	if (WARN_ON(!(*state & HT_ADDBA_REQUESTED_MSK))) {
+	if (WARN_ON(!tid_tx)) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "addBA was not requested yet, state is %d\n",
-				*state);
+		printk(KERN_DEBUG "addBA was not requested!\n");
 #endif
-		spin_unlock_bh(&sta->lock);
-		rcu_read_unlock();
-		return;
+		goto unlock;
 	}
 
-	if (WARN_ON(*state & HT_ADDBA_DRV_READY_MSK))
-		goto out;
-
-	*state |= HT_ADDBA_DRV_READY_MSK;
+	if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)))
+		goto unlock;
 
-	if (*state == HT_AGG_STATE_OPERATIONAL)
+	if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state))
 		ieee80211_agg_tx_operational(local, sta, tid);
 
- out:
-	spin_unlock_bh(&sta->lock);
-	rcu_read_unlock();
+ unlock:
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+	mutex_unlock(&local->sta_mtx);
 }
-EXPORT_SYMBOL(ieee80211_start_tx_ba_cb);
 
 void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
 				      const u8 *ra, u16 tid)
@@ -510,44 +571,37 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
 	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
 	memcpy(&ra_tid->ra, ra, ETH_ALEN);
 	ra_tid->tid = tid;
-	ra_tid->vif = vif;
 
-	skb->pkt_type = IEEE80211_ADDBA_MSG;
-	skb_queue_tail(&local->skb_queue, skb);
-	tasklet_schedule(&local->tasklet);
+	skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_START;
+	skb_queue_tail(&sdata->skb_queue, skb);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
 
 int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
-				   enum ieee80211_back_parties initiator)
+				   enum ieee80211_back_parties initiator,
+				   bool tx)
 {
-	u8 *state;
 	int ret;
 
-	/* check if the TID is in aggregation */
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-	spin_lock_bh(&sta->lock);
+	mutex_lock(&sta->ampdu_mlme.mtx);
 
-	if (*state != HT_AGG_STATE_OPERATIONAL) {
-		ret = -ENOENT;
-		goto unlock;
-	}
+	ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx);
 
-	ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
 
- unlock:
-	spin_unlock_bh(&sta->lock);
 	return ret;
 }
 
-int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
-				 enum ieee80211_back_parties initiator)
+int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
+	struct tid_ampdu_tx *tid_tx;
+	int ret = 0;
 
-	trace_api_stop_tx_ba_session(pubsta, tid, initiator);
+	trace_api_stop_tx_ba_session(pubsta, tid);
 
 	if (!local->ops->ampdu_action)
 		return -EINVAL;
@@ -555,7 +609,26 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
 	if (tid >= STA_TID_NUM)
 		return -EINVAL;
 
-	return __ieee80211_stop_tx_ba_session(sta, tid, initiator);
+	spin_lock_bh(&sta->lock);
+	tid_tx = sta->ampdu_mlme.tid_tx[tid];
+
+	if (!tid_tx) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
+		/* already in progress stopping it */
+		ret = 0;
+		goto unlock;
+	}
+
+	set_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state);
+	ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+
+ unlock:
+	spin_unlock_bh(&sta->lock);
+	return ret;
 }
 EXPORT_SYMBOL(ieee80211_stop_tx_ba_session);
 
@@ -564,7 +637,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
-	u8 *state;
+	struct tid_ampdu_tx *tid_tx;
 
 	trace_api_stop_tx_ba_cb(sdata, ra, tid);
 
@@ -581,51 +654,56 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid)
 	       ra, tid);
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
-	rcu_read_lock();
+	mutex_lock(&local->sta_mtx);
+
 	sta = sta_info_get(sdata, ra);
 	if (!sta) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "Could not find station: %pM\n", ra);
 #endif
-		rcu_read_unlock();
-		return;
+		goto unlock;
 	}
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
 
-	/* NOTE: no need to use sta->lock in this state check, as
-	 * ieee80211_stop_tx_ba_session will let only one stop call to
-	 * pass through per sta/tid
-	 */
-	if ((*state & HT_AGG_STATE_REQ_STOP_BA_MSK) == 0) {
+	mutex_lock(&sta->ampdu_mlme.mtx);
+	spin_lock_bh(&sta->lock);
+	tid_tx = sta->ampdu_mlme.tid_tx[tid];
+
+	if (!tid_tx || !test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state)) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "unexpected callback to A-MPDU stop\n");
 #endif
-		rcu_read_unlock();
-		return;
+		goto unlock_sta;
 	}
 
-	if (*state & HT_AGG_STATE_INITIATOR_MSK)
+	if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop)
 		ieee80211_send_delba(sta->sdata, ra, tid,
 			WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
 
-	spin_lock_bh(&sta->lock);
-	spin_lock(&local->ampdu_lock);
+	/*
+	 * When we get here, the TX path will not be lockless any more wrt.
+	 * aggregation, since the OPERATIONAL bit has long been cleared.
+	 * Thus it will block on getting the lock, if it occurs. So if we
+	 * stop the queue now, we will not get any more packets, and any
+	 * that might be being processed will wait for us here, thereby
+	 * guaranteeing that no packets go to the tid_tx pending queue any
+	 * more.
+	 */
 
-	ieee80211_agg_splice_packets(local, sta, tid);
+	ieee80211_agg_splice_packets(local, tid_tx, tid);
 
-	*state = HT_AGG_STATE_IDLE;
-	/* from now on packets are no longer put onto sta->pending */
-	kfree(sta->ampdu_mlme.tid_tx[tid]);
-	sta->ampdu_mlme.tid_tx[tid] = NULL;
+	/* future packets must not find the tid_tx struct any more */
+	rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], NULL);
 
-	ieee80211_agg_splice_finish(local, sta, tid);
+	ieee80211_agg_splice_finish(local, tid);
 
-	spin_unlock(&local->ampdu_lock);
-	spin_unlock_bh(&sta->lock);
+	call_rcu(&tid_tx->rcu_head, kfree_tid_tx);
 
-	rcu_read_unlock();
+ unlock_sta:
+	spin_unlock_bh(&sta->lock);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+ unlock:
+	mutex_unlock(&local->sta_mtx);
 }
-EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb);
 
 void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
 				     const u8 *ra, u16 tid)
@@ -646,11 +724,10 @@ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
 	ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
 	memcpy(&ra_tid->ra, ra, ETH_ALEN);
 	ra_tid->tid = tid;
-	ra_tid->vif = vif;
 
-	skb->pkt_type = IEEE80211_DELBA_MSG;
-	skb_queue_tail(&local->skb_queue, skb);
-	tasklet_schedule(&local->tasklet);
+	skb->pkt_type = IEEE80211_SDATA_QUEUE_AGG_STOP;
+	skb_queue_tail(&sdata->skb_queue, skb);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 EXPORT_SYMBOL(ieee80211_stop_tx_ba_cb_irqsafe);
 
@@ -660,47 +737,48 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 				  struct ieee80211_mgmt *mgmt,
 				  size_t len)
 {
+	struct tid_ampdu_tx *tid_tx;
 	u16 capab, tid;
-	u8 *state;
 
 	capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
 	tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
 
-	state = &sta->ampdu_mlme.tid_state_tx[tid];
-
-	spin_lock_bh(&sta->lock);
+	mutex_lock(&sta->ampdu_mlme.mtx);
 
-	if (!(*state & HT_ADDBA_REQUESTED_MSK))
+	tid_tx = sta->ampdu_mlme.tid_tx[tid];
+	if (!tid_tx)
 		goto out;
 
-	if (mgmt->u.action.u.addba_resp.dialog_token !=
-		sta->ampdu_mlme.tid_tx[tid]->dialog_token) {
+	if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
+#endif
 		goto out;
 	}
 
-	del_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
+	del_timer(&tid_tx->addba_resp_timer);
 
 #ifdef CONFIG_MAC80211_HT_DEBUG
 	printk(KERN_DEBUG "switched off addBA timer for tid %d\n", tid);
-#endif /* CONFIG_MAC80211_HT_DEBUG */
+#endif
 
 	if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
 			== WLAN_STATUS_SUCCESS) {
-		u8 curstate = *state;
-
-		*state |= HT_ADDBA_RECEIVED_MSK;
+		if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED,
+				     &tid_tx->state)) {
+			/* ignore duplicate response */
+			goto out;
+		}
 
-		if (*state != curstate && *state == HT_AGG_STATE_OPERATIONAL)
+		if (test_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))
 			ieee80211_agg_tx_operational(local, sta, tid);
 
 		sta->ampdu_mlme.addba_req_num[tid] = 0;
 	} else {
-		___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR);
+		___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR,
+						true);
 	}
 
  out:
-	spin_unlock_bh(&sta->lock);
+	mutex_unlock(&sta->ampdu_mlme.mtx);
 }
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 67ee34f57df..18bd0e55060 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -19,33 +19,6 @@
 #include "rate.h"
 #include "mesh.h"
 
-static bool nl80211_type_check(enum nl80211_iftype type)
-{
-	switch (type) {
-	case NL80211_IFTYPE_ADHOC:
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_MONITOR:
-#ifdef CONFIG_MAC80211_MESH
-	case NL80211_IFTYPE_MESH_POINT:
-#endif
-	case NL80211_IFTYPE_AP:
-	case NL80211_IFTYPE_AP_VLAN:
-	case NL80211_IFTYPE_WDS:
-		return true;
-	default:
-		return false;
-	}
-}
-
-static bool nl80211_params_check(enum nl80211_iftype type,
-				 struct vif_params *params)
-{
-	if (!nl80211_type_check(type))
-		return false;
-
-	return true;
-}
-
 static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
 			       enum nl80211_iftype type, u32 *flags,
 			       struct vif_params *params)
@@ -55,9 +28,6 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name,
 	struct ieee80211_sub_if_data *sdata;
 	int err;
 
-	if (!nl80211_params_check(type, params))
-		return -EINVAL;
-
 	err = ieee80211_if_add(local, name, &dev, type, params);
 	if (err || type != NL80211_IFTYPE_MONITOR || !flags)
 		return err;
@@ -82,12 +52,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	int ret;
 
-	if (ieee80211_sdata_running(sdata))
-		return -EBUSY;
-
-	if (!nl80211_params_check(type, params))
-		return -EINVAL;
-
 	ret = ieee80211_if_change_type(sdata, type);
 	if (ret)
 		return ret;
@@ -104,69 +68,95 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
 		 params && params->use_4addr >= 0)
 		sdata->u.mgd.use_4addr = params->use_4addr;
 
-	if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags)
-		sdata->u.mntr_flags = *flags;
+	if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) {
+		struct ieee80211_local *local = sdata->local;
+
+		if (ieee80211_sdata_running(sdata)) {
+			/*
+			 * Prohibit MONITOR_FLAG_COOK_FRAMES to be
+			 * changed while the interface is up.
+			 * Else we would need to add a lot of cruft
+			 * to update everything:
+			 *	cooked_mntrs, monitor and all fif_* counters
+			 *	reconfigure hardware
+			 */
+			if ((*flags & MONITOR_FLAG_COOK_FRAMES) !=
+			    (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
+				return -EBUSY;
+
+			ieee80211_adjust_monitor_flags(sdata, -1);
+			sdata->u.mntr_flags = *flags;
+			ieee80211_adjust_monitor_flags(sdata, 1);
+
+			ieee80211_configure_filter(local);
+		} else {
+			/*
+			 * Because the interface is down, ieee80211_do_stop
+			 * and ieee80211_do_open take care of "everything"
+			 * mentioned in the comment above.
+			 */
+			sdata->u.mntr_flags = *flags;
+		}
+	}
 
 	return 0;
 }
 
 static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
-			     u8 key_idx, const u8 *mac_addr,
+			     u8 key_idx, bool pairwise, const u8 *mac_addr,
 			     struct key_params *params)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct sta_info *sta = NULL;
-	enum ieee80211_key_alg alg;
 	struct ieee80211_key *key;
 	int err;
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	if (!ieee80211_sdata_running(sdata))
+		return -ENETDOWN;
 
+	/* reject WEP and TKIP keys if WEP failed to initialize */
 	switch (params->cipher) {
 	case WLAN_CIPHER_SUITE_WEP40:
-	case WLAN_CIPHER_SUITE_WEP104:
-		alg = ALG_WEP;
-		break;
 	case WLAN_CIPHER_SUITE_TKIP:
-		alg = ALG_TKIP;
-		break;
-	case WLAN_CIPHER_SUITE_CCMP:
-		alg = ALG_CCMP;
-		break;
-	case WLAN_CIPHER_SUITE_AES_CMAC:
-		alg = ALG_AES_CMAC;
+	case WLAN_CIPHER_SUITE_WEP104:
+		if (IS_ERR(sdata->local->wep_tx_tfm))
+			return -EINVAL;
 		break;
 	default:
-		return -EINVAL;
+		break;
 	}
 
-	key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key,
-				  params->seq_len, params->seq);
-	if (!key)
-		return -ENOMEM;
+	key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
+				  params->key, params->seq_len, params->seq);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
 
-	rcu_read_lock();
+	if (pairwise)
+		key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
+
+	mutex_lock(&sdata->local->sta_mtx);
 
 	if (mac_addr) {
 		sta = sta_info_get_bss(sdata, mac_addr);
 		if (!sta) {
-			ieee80211_key_free(key);
+			ieee80211_key_free(sdata->local, key);
 			err = -ENOENT;
 			goto out_unlock;
 		}
 	}
 
-	ieee80211_key_link(key, sdata, sta);
+	err = ieee80211_key_link(key, sdata, sta);
+	if (err)
+		ieee80211_key_free(sdata->local, key);
 
-	err = 0;
  out_unlock:
-	rcu_read_unlock();
+	mutex_unlock(&sdata->local->sta_mtx);
 
 	return err;
 }
 
 static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
-			     u8 key_idx, const u8 *mac_addr)
+			     u8 key_idx, bool pairwise, const u8 *mac_addr)
 {
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info *sta;
@@ -174,7 +164,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
 
 	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	rcu_read_lock();
+	mutex_lock(&sdata->local->sta_mtx);
 
 	if (mac_addr) {
 		ret = -ENOENT;
@@ -183,10 +173,17 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
 		if (!sta)
 			goto out_unlock;
 
-		if (sta->key) {
-			ieee80211_key_free(sta->key);
-			WARN_ON(sta->key);
-			ret = 0;
+		if (pairwise) {
+			if (sta->ptk) {
+				ieee80211_key_free(sdata->local, sta->ptk);
+				ret = 0;
+			}
+		} else {
+			if (sta->gtk[key_idx]) {
+				ieee80211_key_free(sdata->local,
+						   sta->gtk[key_idx]);
+				ret = 0;
+			}
 		}
 
 		goto out_unlock;
@@ -197,18 +194,19 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
 		goto out_unlock;
 	}
 
-	ieee80211_key_free(sdata->keys[key_idx]);
+	ieee80211_key_free(sdata->local, sdata->keys[key_idx]);
 	WARN_ON(sdata->keys[key_idx]);
 
 	ret = 0;
  out_unlock:
-	rcu_read_unlock();
+	mutex_unlock(&sdata->local->sta_mtx);
 
 	return ret;
 }
 
 static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
-			     u8 key_idx, const u8 *mac_addr, void *cookie,
+			     u8 key_idx, bool pairwise, const u8 *mac_addr,
+			     void *cookie,
 			     void (*callback)(void *cookie,
 					      struct key_params *params))
 {
@@ -216,7 +214,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 	struct sta_info *sta = NULL;
 	u8 seq[6] = {0};
 	struct key_params params;
-	struct ieee80211_key *key;
+	struct ieee80211_key *key = NULL;
 	u32 iv32;
 	u16 iv16;
 	int err = -ENOENT;
@@ -230,7 +228,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 		if (!sta)
 			goto out;
 
-		key = sta->key;
+		if (pairwise)
+			key = sta->ptk;
+		else if (key_idx < NUM_DEFAULT_KEYS)
+			key = sta->gtk[key_idx];
 	} else
 		key = sdata->keys[key_idx];
 
@@ -239,10 +240,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 
 	memset(&params, 0, sizeof(params));
 
-	switch (key->conf.alg) {
-	case ALG_TKIP:
-		params.cipher = WLAN_CIPHER_SUITE_TKIP;
+	params.cipher = key->conf.cipher;
 
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_TKIP:
 		iv32 = key->u.tkip.tx.iv32;
 		iv16 = key->u.tkip.tx.iv16;
 
@@ -260,8 +261,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 		params.seq = seq;
 		params.seq_len = 6;
 		break;
-	case ALG_CCMP:
-		params.cipher = WLAN_CIPHER_SUITE_CCMP;
+	case WLAN_CIPHER_SUITE_CCMP:
 		seq[0] = key->u.ccmp.tx_pn[5];
 		seq[1] = key->u.ccmp.tx_pn[4];
 		seq[2] = key->u.ccmp.tx_pn[3];
@@ -271,14 +271,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
 		params.seq = seq;
 		params.seq_len = 6;
 		break;
-	case ALG_WEP:
-		if (key->conf.keylen == 5)
-			params.cipher = WLAN_CIPHER_SUITE_WEP40;
-		else
-			params.cipher = WLAN_CIPHER_SUITE_WEP104;
-		break;
-	case ALG_AES_CMAC:
-		params.cipher = WLAN_CIPHER_SUITE_AES_CMAC;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		seq[0] = key->u.aes_cmac.tx_pn[5];
 		seq[1] = key->u.aes_cmac.tx_pn[4];
 		seq[2] = key->u.aes_cmac.tx_pn[3];
@@ -305,15 +298,10 @@ static int ieee80211_config_default_key(struct wiphy *wiphy,
 					struct net_device *dev,
 					u8 key_idx)
 {
-	struct ieee80211_sub_if_data *sdata;
-
-	rcu_read_lock();
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	ieee80211_set_default_key(sdata, key_idx);
 
-	rcu_read_unlock();
-
 	return 0;
 }
 
@@ -321,15 +309,10 @@ static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy,
 					     struct net_device *dev,
 					     u8 key_idx)
 {
-	struct ieee80211_sub_if_data *sdata;
-
-	rcu_read_lock();
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	ieee80211_set_default_mgmt_key(sdata, key_idx);
 
-	rcu_read_unlock();
-
 	return 0;
 }
 
@@ -344,13 +327,19 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 			STATION_INFO_TX_BYTES |
 			STATION_INFO_RX_PACKETS |
 			STATION_INFO_TX_PACKETS |
-			STATION_INFO_TX_BITRATE;
+			STATION_INFO_TX_RETRIES |
+			STATION_INFO_TX_FAILED |
+			STATION_INFO_TX_BITRATE |
+			STATION_INFO_RX_DROP_MISC;
 
 	sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
 	sinfo->rx_bytes = sta->rx_bytes;
 	sinfo->tx_bytes = sta->tx_bytes;
 	sinfo->rx_packets = sta->rx_packets;
 	sinfo->tx_packets = sta->tx_packets;
+	sinfo->tx_retries = sta->tx_retry_count;
+	sinfo->tx_failed = sta->tx_retry_failed;
+	sinfo->rx_dropped_misc = sta->rx_dropped;
 
 	if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) ||
 	    (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) {
@@ -415,9 +404,6 @@ static int ieee80211_dump_survey(struct wiphy *wiphy, struct net_device *dev,
 {
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
 
-	if (!local->ops->get_survey)
-		return -EOPNOTSUPP;
-
 	return drv_get_survey(local, idx, survey);
 }
 
@@ -600,7 +586,7 @@ struct iapp_layer2_update {
 	u8 ssap;		/* 0 */
 	u8 control;
 	u8 xid_info[3];
-} __attribute__ ((packed));
+} __packed;
 
 static void ieee80211_send_layer2_update(struct sta_info *sta)
 {
@@ -639,6 +625,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 				 struct sta_info *sta,
 				 struct station_parameters *params)
 {
+	unsigned long flags;
 	u32 rates;
 	int i, j;
 	struct ieee80211_supported_band *sband;
@@ -647,7 +634,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 
 	sband = local->hw.wiphy->bands[local->oper_channel->band];
 
-	spin_lock_bh(&sta->lock);
+	spin_lock_irqsave(&sta->flaglock, flags);
 	mask = params->sta_flags_mask;
 	set = params->sta_flags_set;
 
@@ -674,7 +661,7 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 		if (set & BIT(NL80211_STA_FLAG_MFP))
 			sta->flags |= WLAN_STA_MFP;
 	}
-	spin_unlock_bh(&sta->lock);
+	spin_unlock_irqrestore(&sta->flaglock, flags);
 
 	/*
 	 * cfg80211 validates this (1-2007) and allows setting the AID
@@ -1148,16 +1135,12 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 	p.uapsd = false;
 
 	if (drv_conf_tx(local, params->queue, &p)) {
-		printk(KERN_DEBUG "%s: failed to set TX queue "
-		       "parameters for queue %d\n",
-		       wiphy_name(local->hw.wiphy), params->queue);
+		wiphy_debug(local->hw.wiphy,
+			    "failed to set TX queue parameters for queue %d\n",
+			    params->queue);
 		return -EINVAL;
 	}
 
-	/* enable WMM or activate new settings */
-	local->hw.conf.flags |= IEEE80211_CONF_QOS;
-	drv_config(local, IEEE80211_CONF_CHANGE_QOS);
-
 	return 0;
 }
 
@@ -1216,15 +1199,26 @@ static int ieee80211_scan(struct wiphy *wiphy,
 			  struct net_device *dev,
 			  struct cfg80211_scan_request *req)
 {
-	struct ieee80211_sub_if_data *sdata;
-
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
-	if (sdata->vif.type != NL80211_IFTYPE_STATION &&
-	    sdata->vif.type != NL80211_IFTYPE_ADHOC &&
-	    sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
-	    (sdata->vif.type != NL80211_IFTYPE_AP || sdata->u.ap.beacon))
+	switch (ieee80211_vif_type_p2p(&sdata->vif)) {
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_MESH_POINT:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	case NL80211_IFTYPE_P2P_GO:
+		if (sdata->local->ops->hw_scan)
+			break;
+		/* FIXME: implement NoA while scanning in software */
+		return -EOPNOTSUPP;
+	case NL80211_IFTYPE_AP:
+		if (sdata->u.ap.beacon)
+			return -EOPNOTSUPP;
+		break;
+	default:
 		return -EOPNOTSUPP;
+	}
 
 	return ieee80211_request_scan(sdata, req);
 }
@@ -1331,28 +1325,28 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 }
 
 static int ieee80211_set_tx_power(struct wiphy *wiphy,
-				  enum tx_power_setting type, int dbm)
+				  enum nl80211_tx_power_setting type, int mbm)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	struct ieee80211_channel *chan = local->hw.conf.channel;
 	u32 changes = 0;
 
 	switch (type) {
-	case TX_POWER_AUTOMATIC:
+	case NL80211_TX_POWER_AUTOMATIC:
 		local->user_power_level = -1;
 		break;
-	case TX_POWER_LIMITED:
-		if (dbm < 0)
-			return -EINVAL;
-		local->user_power_level = dbm;
+	case NL80211_TX_POWER_LIMITED:
+		if (mbm < 0 || (mbm % 100))
+			return -EOPNOTSUPP;
+		local->user_power_level = MBM_TO_DBM(mbm);
 		break;
-	case TX_POWER_FIXED:
-		if (dbm < 0)
-			return -EINVAL;
+	case NL80211_TX_POWER_FIXED:
+		if (mbm < 0 || (mbm % 100))
+			return -EOPNOTSUPP;
 		/* TODO: move to cfg80211 when it knows the channel */
-		if (dbm > chan->max_power)
+		if (MBM_TO_DBM(mbm) > chan->max_power)
 			return -EINVAL;
-		local->user_power_level = dbm;
+		local->user_power_level = MBM_TO_DBM(mbm);
 		break;
 	}
 
@@ -1371,7 +1365,7 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, int *dbm)
 }
 
 static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
-				  u8 *addr)
+				  const u8 *addr)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 
@@ -1420,7 +1414,7 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
 	if (!sdata->u.mgd.associated ||
 	    sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) {
 		mutex_lock(&sdata->local->iflist_mtx);
-		ieee80211_recalc_smps(sdata->local, sdata);
+		ieee80211_recalc_smps(sdata->local);
 		mutex_unlock(&sdata->local->iflist_mtx);
 		return 0;
 	}
@@ -1448,7 +1442,6 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
-	struct ieee80211_conf *conf = &local->hw.conf;
 
 	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
@@ -1457,11 +1450,11 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 		return -EOPNOTSUPP;
 
 	if (enabled == sdata->u.mgd.powersave &&
-	    timeout == conf->dynamic_ps_forced_timeout)
+	    timeout == local->dynamic_ps_forced_timeout)
 		return 0;
 
 	sdata->u.mgd.powersave = enabled;
-	conf->dynamic_ps_forced_timeout = timeout;
+	local->dynamic_ps_forced_timeout = timeout;
 
 	/* no change, but if automatic follow powersave */
 	mutex_lock(&sdata->u.mgd.mtx);
@@ -1551,13 +1544,81 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
 	return ieee80211_wk_cancel_remain_on_channel(sdata, cookie);
 }
 
-static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev,
-			    struct ieee80211_channel *chan,
-			    enum nl80211_channel_type channel_type,
-			    const u8 *buf, size_t len, u64 *cookie)
+static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev,
+			     struct ieee80211_channel *chan,
+			     enum nl80211_channel_type channel_type,
+			     bool channel_type_valid,
+			     const u8 *buf, size_t len, u64 *cookie)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct sta_info *sta;
+	const struct ieee80211_mgmt *mgmt = (void *)buf;
+	u32 flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX |
+		    IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+	/* Check that we are on the requested channel for transmission */
+	if (chan != local->tmp_channel &&
+	    chan != local->oper_channel)
+		return -EBUSY;
+	if (channel_type_valid &&
+	    (channel_type != local->tmp_channel_type &&
+	     channel_type != local->_oper_channel_type))
+		return -EBUSY;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
+		if (!ieee80211_is_action(mgmt->frame_control) ||
+		    mgmt->u.action.category == WLAN_CATEGORY_PUBLIC)
+			break;
+		rcu_read_lock();
+		sta = sta_info_get(sdata, mgmt->da);
+		rcu_read_unlock();
+		if (!sta)
+			return -ENOLINK;
+		break;
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + len);
+	if (!skb)
+		return -ENOMEM;
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+
+	memcpy(skb_put(skb, len), buf, len);
+
+	IEEE80211_SKB_CB(skb)->flags = flags;
+
+	skb->dev = sdata->dev;
+	ieee80211_tx_skb(sdata, skb);
+
+	*cookie = (unsigned long) skb;
+	return 0;
+}
+
+static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
+					  struct net_device *dev,
+					  u16 frame_type, bool reg)
 {
-	return ieee80211_mgd_action(IEEE80211_DEV_TO_SUB_IF(dev), chan,
-				    channel_type, buf, len, cookie);
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ))
+		return;
+
+	if (reg)
+		local->probe_req_reg++;
+	else
+		local->probe_req_reg--;
+
+	ieee80211_queue_work(&local->hw, &local->reconfig_filter);
 }
 
 struct cfg80211_ops mac80211_config_ops = {
@@ -1609,6 +1670,7 @@ struct cfg80211_ops mac80211_config_ops = {
 	.set_bitrate_mask = ieee80211_set_bitrate_mask,
 	.remain_on_channel = ieee80211_remain_on_channel,
 	.cancel_remain_on_channel = ieee80211_cancel_remain_on_channel,
-	.action = ieee80211_action,
+	.mgmt_tx = ieee80211_mgmt_tx,
 	.set_cqm_rssi_config = ieee80211_set_cqm_rssi_config,
+	.mgmt_frame_register = ieee80211_mgmt_frame_register,
 };
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 32be11e4c4d..5b24740fc0b 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -11,7 +11,7 @@ __ieee80211_get_channel_mode(struct ieee80211_local *local,
 {
 	struct ieee80211_sub_if_data *sdata;
 
-	WARN_ON(!mutex_is_locked(&local->iflist_mtx));
+	lockdep_assert_held(&local->iflist_mtx);
 
 	list_for_each_entry(sdata, &local->interfaces, list) {
 		if (sdata == ignore)
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 637929b65cc..18260aa99c5 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -36,6 +36,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 };
 
 #define DEBUGFS_ADD(name)						\
@@ -85,13 +86,15 @@ static ssize_t tsf_write(struct file *file,
 	if (strncmp(buf, "reset", 5) == 0) {
 		if (local->ops->reset_tsf) {
 			drv_reset_tsf(local);
-			printk(KERN_INFO "%s: debugfs reset TSF\n", wiphy_name(local->hw.wiphy));
+			wiphy_info(local->hw.wiphy, "debugfs reset TSF\n");
 		}
 	} else {
 		tsf = simple_strtoul(buf, NULL, 0);
 		if (local->ops->set_tsf) {
 			drv_set_tsf(local, tsf);
-			printk(KERN_INFO "%s: debugfs set TSF to %#018llx\n", wiphy_name(local->hw.wiphy), tsf);
+			wiphy_info(local->hw.wiphy,
+				   "debugfs set TSF to %#018llx\n", tsf);
+
 		}
 	}
 
@@ -101,7 +104,8 @@ static ssize_t tsf_write(struct file *file,
 static const struct file_operations tsf_ops = {
 	.read = tsf_read,
 	.write = tsf_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t reset_write(struct file *file, const char __user *user_buf,
@@ -120,6 +124,7 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
 static const struct file_operations reset_ops = {
 	.write = reset_write,
 	.open = mac80211_open_file_generic,
+	.llseek = noop_llseek,
 };
 
 static ssize_t noack_read(struct file *file, char __user *user_buf,
@@ -155,7 +160,8 @@ static ssize_t noack_write(struct file *file,
 static const struct file_operations noack_ops = {
 	.read = noack_read,
 	.write = noack_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t uapsd_queues_read(struct file *file, char __user *user_buf,
@@ -201,7 +207,8 @@ static ssize_t uapsd_queues_write(struct file *file,
 static const struct file_operations uapsd_queues_ops = {
 	.read = uapsd_queues_read,
 	.write = uapsd_queues_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t uapsd_max_sp_len_read(struct file *file, char __user *user_buf,
@@ -247,7 +254,8 @@ static ssize_t uapsd_max_sp_len_write(struct file *file,
 static const struct file_operations uapsd_max_sp_len_ops = {
 	.read = uapsd_max_sp_len_read,
 	.write = uapsd_max_sp_len_write,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t channel_type_read(struct file *file, char __user *user_buf,
@@ -279,7 +287,8 @@ static ssize_t channel_type_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations channel_type_ops = {
 	.read = channel_type_read,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 static ssize_t queues_read(struct file *file, char __user *user_buf,
@@ -302,14 +311,12 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
 
 static const struct file_operations queues_ops = {
 	.read = queues_read,
-	.open = mac80211_open_file_generic
+	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 /* statistics stuff */
 
-#define DEBUGFS_STATS_FILE(name, buflen, fmt, value...)			\
-	DEBUGFS_READONLY_FILE(stats_ ##name, buflen, fmt, ##value)
-
 static ssize_t format_devstat_counter(struct ieee80211_local *local,
 	char __user *userbuf,
 	size_t count, loff_t *ppos,
@@ -349,77 +356,19 @@ static ssize_t stats_ ##name## _read(struct file *file,			\
 static const struct file_operations stats_ ##name## _ops = {		\
 	.read = stats_ ##name## _read,					\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 };
 
-#define DEBUGFS_STATS_ADD(name)						\
+#define DEBUGFS_STATS_ADD(name, field)					\
+	debugfs_create_u32(#name, 0400, statsd, (u32 *) &field);
+#define DEBUGFS_DEVSTATS_ADD(name)					\
 	debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops);
 
-DEBUGFS_STATS_FILE(transmitted_fragment_count, 20, "%u",
-		   local->dot11TransmittedFragmentCount);
-DEBUGFS_STATS_FILE(multicast_transmitted_frame_count, 20, "%u",
-		   local->dot11MulticastTransmittedFrameCount);
-DEBUGFS_STATS_FILE(failed_count, 20, "%u",
-		   local->dot11FailedCount);
-DEBUGFS_STATS_FILE(retry_count, 20, "%u",
-		   local->dot11RetryCount);
-DEBUGFS_STATS_FILE(multiple_retry_count, 20, "%u",
-		   local->dot11MultipleRetryCount);
-DEBUGFS_STATS_FILE(frame_duplicate_count, 20, "%u",
-		   local->dot11FrameDuplicateCount);
-DEBUGFS_STATS_FILE(received_fragment_count, 20, "%u",
-		   local->dot11ReceivedFragmentCount);
-DEBUGFS_STATS_FILE(multicast_received_frame_count, 20, "%u",
-		   local->dot11MulticastReceivedFrameCount);
-DEBUGFS_STATS_FILE(transmitted_frame_count, 20, "%u",
-		   local->dot11TransmittedFrameCount);
-#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
-DEBUGFS_STATS_FILE(tx_handlers_drop, 20, "%u",
-		   local->tx_handlers_drop);
-DEBUGFS_STATS_FILE(tx_handlers_queued, 20, "%u",
-		   local->tx_handlers_queued);
-DEBUGFS_STATS_FILE(tx_handlers_drop_unencrypted, 20, "%u",
-		   local->tx_handlers_drop_unencrypted);
-DEBUGFS_STATS_FILE(tx_handlers_drop_fragment, 20, "%u",
-		   local->tx_handlers_drop_fragment);
-DEBUGFS_STATS_FILE(tx_handlers_drop_wep, 20, "%u",
-		   local->tx_handlers_drop_wep);
-DEBUGFS_STATS_FILE(tx_handlers_drop_not_assoc, 20, "%u",
-		   local->tx_handlers_drop_not_assoc);
-DEBUGFS_STATS_FILE(tx_handlers_drop_unauth_port, 20, "%u",
-		   local->tx_handlers_drop_unauth_port);
-DEBUGFS_STATS_FILE(rx_handlers_drop, 20, "%u",
-		   local->rx_handlers_drop);
-DEBUGFS_STATS_FILE(rx_handlers_queued, 20, "%u",
-		   local->rx_handlers_queued);
-DEBUGFS_STATS_FILE(rx_handlers_drop_nullfunc, 20, "%u",
-		   local->rx_handlers_drop_nullfunc);
-DEBUGFS_STATS_FILE(rx_handlers_drop_defrag, 20, "%u",
-		   local->rx_handlers_drop_defrag);
-DEBUGFS_STATS_FILE(rx_handlers_drop_short, 20, "%u",
-		   local->rx_handlers_drop_short);
-DEBUGFS_STATS_FILE(rx_handlers_drop_passive_scan, 20, "%u",
-		   local->rx_handlers_drop_passive_scan);
-DEBUGFS_STATS_FILE(tx_expand_skb_head, 20, "%u",
-		   local->tx_expand_skb_head);
-DEBUGFS_STATS_FILE(tx_expand_skb_head_cloned, 20, "%u",
-		   local->tx_expand_skb_head_cloned);
-DEBUGFS_STATS_FILE(rx_expand_skb_head, 20, "%u",
-		   local->rx_expand_skb_head);
-DEBUGFS_STATS_FILE(rx_expand_skb_head2, 20, "%u",
-		   local->rx_expand_skb_head2);
-DEBUGFS_STATS_FILE(rx_handlers_fragments, 20, "%u",
-		   local->rx_handlers_fragments);
-DEBUGFS_STATS_FILE(tx_status_drop, 20, "%u",
-		   local->tx_status_drop);
-
-#endif
-
 DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount);
 DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount);
 DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount);
 DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount);
 
-
 void debugfs_hw_add(struct ieee80211_local *local)
 {
 	struct dentry *phyd = local->hw.wiphy->debugfsdir;
@@ -428,7 +377,6 @@ void debugfs_hw_add(struct ieee80211_local *local)
 	if (!phyd)
 		return;
 
-	local->debugfs.stations = debugfs_create_dir("stations", phyd);
 	local->debugfs.keys = debugfs_create_dir("keys", phyd);
 
 	DEBUGFS_ADD(frequency);
@@ -448,38 +396,60 @@ void debugfs_hw_add(struct ieee80211_local *local)
 	if (!statsd)
 		return;
 
-	DEBUGFS_STATS_ADD(transmitted_fragment_count);
-	DEBUGFS_STATS_ADD(multicast_transmitted_frame_count);
-	DEBUGFS_STATS_ADD(failed_count);
-	DEBUGFS_STATS_ADD(retry_count);
-	DEBUGFS_STATS_ADD(multiple_retry_count);
-	DEBUGFS_STATS_ADD(frame_duplicate_count);
-	DEBUGFS_STATS_ADD(received_fragment_count);
-	DEBUGFS_STATS_ADD(multicast_received_frame_count);
-	DEBUGFS_STATS_ADD(transmitted_frame_count);
+	DEBUGFS_STATS_ADD(transmitted_fragment_count,
+		local->dot11TransmittedFragmentCount);
+	DEBUGFS_STATS_ADD(multicast_transmitted_frame_count,
+		local->dot11MulticastTransmittedFrameCount);
+	DEBUGFS_STATS_ADD(failed_count, local->dot11FailedCount);
+	DEBUGFS_STATS_ADD(retry_count, local->dot11RetryCount);
+	DEBUGFS_STATS_ADD(multiple_retry_count,
+		local->dot11MultipleRetryCount);
+	DEBUGFS_STATS_ADD(frame_duplicate_count,
+		local->dot11FrameDuplicateCount);
+	DEBUGFS_STATS_ADD(received_fragment_count,
+		local->dot11ReceivedFragmentCount);
+	DEBUGFS_STATS_ADD(multicast_received_frame_count,
+		local->dot11MulticastReceivedFrameCount);
+	DEBUGFS_STATS_ADD(transmitted_frame_count,
+		local->dot11TransmittedFrameCount);
 #ifdef CONFIG_MAC80211_DEBUG_COUNTERS
-	DEBUGFS_STATS_ADD(tx_handlers_drop);
-	DEBUGFS_STATS_ADD(tx_handlers_queued);
-	DEBUGFS_STATS_ADD(tx_handlers_drop_unencrypted);
-	DEBUGFS_STATS_ADD(tx_handlers_drop_fragment);
-	DEBUGFS_STATS_ADD(tx_handlers_drop_wep);
-	DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc);
-	DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port);
-	DEBUGFS_STATS_ADD(rx_handlers_drop);
-	DEBUGFS_STATS_ADD(rx_handlers_queued);
-	DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc);
-	DEBUGFS_STATS_ADD(rx_handlers_drop_defrag);
-	DEBUGFS_STATS_ADD(rx_handlers_drop_short);
-	DEBUGFS_STATS_ADD(rx_handlers_drop_passive_scan);
-	DEBUGFS_STATS_ADD(tx_expand_skb_head);
-	DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned);
-	DEBUGFS_STATS_ADD(rx_expand_skb_head);
-	DEBUGFS_STATS_ADD(rx_expand_skb_head2);
-	DEBUGFS_STATS_ADD(rx_handlers_fragments);
-	DEBUGFS_STATS_ADD(tx_status_drop);
+	DEBUGFS_STATS_ADD(tx_handlers_drop, local->tx_handlers_drop);
+	DEBUGFS_STATS_ADD(tx_handlers_queued, local->tx_handlers_queued);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_unencrypted,
+		local->tx_handlers_drop_unencrypted);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_fragment,
+		local->tx_handlers_drop_fragment);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_wep,
+		local->tx_handlers_drop_wep);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc,
+		local->tx_handlers_drop_not_assoc);
+	DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port,
+		local->tx_handlers_drop_unauth_port);
+	DEBUGFS_STATS_ADD(rx_handlers_drop, local->rx_handlers_drop);
+	DEBUGFS_STATS_ADD(rx_handlers_queued, local->rx_handlers_queued);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc,
+		local->rx_handlers_drop_nullfunc);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_defrag,
+		local->rx_handlers_drop_defrag);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_short,
+		local->rx_handlers_drop_short);
+	DEBUGFS_STATS_ADD(rx_handlers_drop_passive_scan,
+		local->rx_handlers_drop_passive_scan);
+	DEBUGFS_STATS_ADD(tx_expand_skb_head,
+		local->tx_expand_skb_head);
+	DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned,
+		local->tx_expand_skb_head_cloned);
+	DEBUGFS_STATS_ADD(rx_expand_skb_head,
+		local->rx_expand_skb_head);
+	DEBUGFS_STATS_ADD(rx_expand_skb_head2,
+		local->rx_expand_skb_head2);
+	DEBUGFS_STATS_ADD(rx_handlers_fragments,
+		local->rx_handlers_fragments);
+	DEBUGFS_STATS_ADD(tx_status_drop,
+		local->tx_status_drop);
 #endif
-	DEBUGFS_STATS_ADD(dot11ACKFailureCount);
-	DEBUGFS_STATS_ADD(dot11RTSFailureCount);
-	DEBUGFS_STATS_ADD(dot11FCSErrorCount);
-	DEBUGFS_STATS_ADD(dot11RTSSuccessCount);
+	DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount);
+	DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount);
+	DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount);
+	DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount);
 }
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 97c9e46e859..1243d1db5c5 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -32,6 +32,7 @@ static ssize_t key_##name##_read(struct file *file,			\
 static const struct file_operations key_ ##name## _ops = {		\
 	.read = key_##name##_read,					\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 }
 
 #define KEY_FILE(name, format)						\
@@ -46,6 +47,7 @@ static const struct file_operations key_ ##name## _ops = {		\
 static const struct file_operations key_ ##name## _ops = {		\
 	.read = key_conf_##name##_read,					\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 }
 
 #define KEY_CONF_FILE(name, format)					\
@@ -64,26 +66,13 @@ static ssize_t key_algorithm_read(struct file *file,
 				  char __user *userbuf,
 				  size_t count, loff_t *ppos)
 {
-	char *alg;
+	char buf[15];
 	struct ieee80211_key *key = file->private_data;
+	u32 c = key->conf.cipher;
 
-	switch (key->conf.alg) {
-	case ALG_WEP:
-		alg = "WEP\n";
-		break;
-	case ALG_TKIP:
-		alg = "TKIP\n";
-		break;
-	case ALG_CCMP:
-		alg = "CCMP\n";
-		break;
-	case ALG_AES_CMAC:
-		alg = "AES-128-CMAC\n";
-		break;
-	default:
-		return 0;
-	}
-	return simple_read_from_buffer(userbuf, count, ppos, alg, strlen(alg));
+	sprintf(buf, "%.2x-%.2x-%.2x:%d\n",
+		c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff);
+	return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
 }
 KEY_OPS(algorithm);
 
@@ -95,21 +84,22 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
 	int len;
 	struct ieee80211_key *key = file->private_data;
 
-	switch (key->conf.alg) {
-	case ALG_WEP:
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
 		len = scnprintf(buf, sizeof(buf), "\n");
 		break;
-	case ALG_TKIP:
+	case WLAN_CIPHER_SUITE_TKIP:
 		len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
 				key->u.tkip.tx.iv32,
 				key->u.tkip.tx.iv16);
 		break;
-	case ALG_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP:
 		tpn = key->u.ccmp.tx_pn;
 		len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
 				tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]);
 		break;
-	case ALG_AES_CMAC:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		tpn = key->u.aes_cmac.tx_pn;
 		len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n",
 				tpn[0], tpn[1], tpn[2], tpn[3], tpn[4],
@@ -130,11 +120,12 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
 	int i, len;
 	const u8 *rpn;
 
-	switch (key->conf.alg) {
-	case ALG_WEP:
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
 		len = scnprintf(buf, sizeof(buf), "\n");
 		break;
-	case ALG_TKIP:
+	case WLAN_CIPHER_SUITE_TKIP:
 		for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
 			p += scnprintf(p, sizeof(buf)+buf-p,
 				       "%08x %04x\n",
@@ -142,8 +133,8 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
 				       key->u.tkip.rx[i].iv16);
 		len = p - buf;
 		break;
-	case ALG_CCMP:
-		for (i = 0; i < NUM_RX_DATA_QUEUES; i++) {
+	case WLAN_CIPHER_SUITE_CCMP:
+		for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) {
 			rpn = key->u.ccmp.rx_pn[i];
 			p += scnprintf(p, sizeof(buf)+buf-p,
 				       "%02x%02x%02x%02x%02x%02x\n",
@@ -152,7 +143,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf,
 		}
 		len = p - buf;
 		break;
-	case ALG_AES_CMAC:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		rpn = key->u.aes_cmac.rx_pn;
 		p += scnprintf(p, sizeof(buf)+buf-p,
 			       "%02x%02x%02x%02x%02x%02x\n",
@@ -174,11 +165,11 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf,
 	char buf[20];
 	int len;
 
-	switch (key->conf.alg) {
-	case ALG_CCMP:
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_CCMP:
 		len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays);
 		break;
-	case ALG_AES_CMAC:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		len = scnprintf(buf, sizeof(buf), "%u\n",
 				key->u.aes_cmac.replays);
 		break;
@@ -196,8 +187,8 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf,
 	char buf[20];
 	int len;
 
-	switch (key->conf.alg) {
-	case ALG_AES_CMAC:
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		len = scnprintf(buf, sizeof(buf), "%u\n",
 				key->u.aes_cmac.icverrors);
 		break;
@@ -212,9 +203,13 @@ static ssize_t key_key_read(struct file *file, char __user *userbuf,
 			    size_t count, loff_t *ppos)
 {
 	struct ieee80211_key *key = file->private_data;
-	int i, res, bufsize = 2 * key->conf.keylen + 2;
+	int i, bufsize = 2 * key->conf.keylen + 2;
 	char *buf = kmalloc(bufsize, GFP_KERNEL);
 	char *p = buf;
+	ssize_t res;
+
+	if (!buf)
+		return -ENOMEM;
 
 	for (i = 0; i < key->conf.keylen; i++)
 		p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 20b2998fa0e..cbdf36d7841 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -121,6 +121,7 @@ static const struct file_operations name##_ops = {			\
 	.read = ieee80211_if_read_##name,				\
 	.write = (_write),						\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 }
 
 #define __IEEE80211_IF_FILE_W(name)					\
@@ -409,6 +410,9 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
 	sprintf(buf, "netdev:%s", sdata->name);
 	sdata->debugfs.dir = debugfs_create_dir(buf,
 		sdata->local->hw.wiphy->debugfsdir);
+	if (sdata->debugfs.dir)
+		sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
+			sdata->debugfs.dir);
 	add_files(sdata);
 }
 
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index e763f1529dd..4601fea1784 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -30,13 +30,13 @@ static ssize_t sta_ ##name## _read(struct file *file,			\
 }
 #define STA_READ_D(name, field) STA_READ(name, 20, field, "%d\n")
 #define STA_READ_U(name, field) STA_READ(name, 20, field, "%u\n")
-#define STA_READ_LU(name, field) STA_READ(name, 20, field, "%lu\n")
 #define STA_READ_S(name, field) STA_READ(name, 20, field, "%s\n")
 
 #define STA_OPS(name)							\
 static const struct file_operations sta_ ##name## _ops = {		\
 	.read = sta_##name##_read,					\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 }
 
 #define STA_OPS_RW(name)						\
@@ -44,6 +44,7 @@ static const struct file_operations sta_ ##name## _ops = {		\
 	.read = sta_##name##_read,					\
 	.write = sta_##name##_write,					\
 	.open = mac80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 }
 
 #define STA_FILE(name, field, format)					\
@@ -52,19 +53,7 @@ static const struct file_operations sta_ ##name## _ops = {		\
 
 STA_FILE(aid, sta.aid, D);
 STA_FILE(dev, sdata->name, S);
-STA_FILE(rx_packets, rx_packets, LU);
-STA_FILE(tx_packets, tx_packets, LU);
-STA_FILE(rx_bytes, rx_bytes, LU);
-STA_FILE(tx_bytes, tx_bytes, LU);
-STA_FILE(rx_duplicates, num_duplicates, LU);
-STA_FILE(rx_fragments, rx_fragments, LU);
-STA_FILE(rx_dropped, rx_dropped, LU);
-STA_FILE(tx_fragments, tx_fragments, LU);
-STA_FILE(tx_filtered, tx_filtered_count, LU);
-STA_FILE(tx_retry_failed, tx_retry_failed, LU);
-STA_FILE(tx_retry_count, tx_retry_count, LU);
 STA_FILE(last_signal, last_signal, D);
-STA_FILE(wep_weak_iv_count, wep_weak_iv_count, LU);
 
 static ssize_t sta_flags_read(struct file *file, char __user *userbuf,
 			      size_t count, loff_t *ppos)
@@ -134,28 +123,25 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
 	p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n",
 			sta->ampdu_mlme.dialog_token_allocator + 1);
 	p += scnprintf(p, sizeof(buf) + buf - p,
-		       "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tSSN\tpending\n");
+		       "TID\t\tRX active\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
 	for (i = 0; i < STA_TID_NUM; i++) {
 		p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
 		p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x",
-				sta->ampdu_mlme.tid_active_rx[i]);
+				!!sta->ampdu_mlme.tid_rx[i]);
 		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
-				sta->ampdu_mlme.tid_active_rx[i] ?
+				sta->ampdu_mlme.tid_rx[i] ?
 				sta->ampdu_mlme.tid_rx[i]->dialog_token : 0);
 		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
-				sta->ampdu_mlme.tid_active_rx[i] ?
+				sta->ampdu_mlme.tid_rx[i] ?
 				sta->ampdu_mlme.tid_rx[i]->ssn : 0);
 
 		p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x",
-				sta->ampdu_mlme.tid_state_tx[i]);
+				!!sta->ampdu_mlme.tid_tx[i]);
 		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
-				sta->ampdu_mlme.tid_state_tx[i] ?
+				sta->ampdu_mlme.tid_tx[i] ?
 				sta->ampdu_mlme.tid_tx[i]->dialog_token : 0);
-		p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
-				sta->ampdu_mlme.tid_state_tx[i] ?
-				sta->ampdu_mlme.tid_tx[i]->ssn : 0);
 		p += scnprintf(p, sizeof(buf) + buf - p, "\t%03d",
-				sta->ampdu_mlme.tid_state_tx[i] ?
+				sta->ampdu_mlme.tid_tx[i] ?
 				skb_queue_len(&sta->ampdu_mlme.tid_tx[i]->pending) : 0);
 		p += scnprintf(p, sizeof(buf) + buf - p, "\n");
 	}
@@ -210,10 +196,10 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
 		if (start)
 			ret = ieee80211_start_tx_ba_session(&sta->sta, tid);
 		else
-			ret = ieee80211_stop_tx_ba_session(&sta->sta, tid,
-							   WLAN_BACK_RECIPIENT);
+			ret = ieee80211_stop_tx_ba_session(&sta->sta, tid);
 	} else {
-		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3);
+		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+					       3, true);
 		ret = 0;
 	}
 
@@ -307,10 +293,17 @@ STA_OPS(ht_capa);
 	debugfs_create_file(#name, 0400, \
 		sta->debugfs.dir, sta, &sta_ ##name## _ops);
 
+#define DEBUGFS_ADD_COUNTER(name, field)				\
+	if (sizeof(sta->field) == sizeof(u32))				\
+		debugfs_create_u32(#name, 0400, sta->debugfs.dir,	\
+			(u32 *) &sta->field);				\
+	else								\
+		debugfs_create_u64(#name, 0400, sta->debugfs.dir,	\
+			(u64 *) &sta->field);
 
 void ieee80211_sta_debugfs_add(struct sta_info *sta)
 {
-	struct dentry *stations_dir = sta->local->debugfs.stations;
+	struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations;
 	u8 mac[3*ETH_ALEN];
 
 	sta->debugfs.add_has_run = true;
@@ -339,20 +332,21 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
 	DEBUGFS_ADD(last_seq_ctrl);
 	DEBUGFS_ADD(agg_status);
 	DEBUGFS_ADD(dev);
-	DEBUGFS_ADD(rx_packets);
-	DEBUGFS_ADD(tx_packets);
-	DEBUGFS_ADD(rx_bytes);
-	DEBUGFS_ADD(tx_bytes);
-	DEBUGFS_ADD(rx_duplicates);
-	DEBUGFS_ADD(rx_fragments);
-	DEBUGFS_ADD(rx_dropped);
-	DEBUGFS_ADD(tx_fragments);
-	DEBUGFS_ADD(tx_filtered);
-	DEBUGFS_ADD(tx_retry_failed);
-	DEBUGFS_ADD(tx_retry_count);
 	DEBUGFS_ADD(last_signal);
-	DEBUGFS_ADD(wep_weak_iv_count);
 	DEBUGFS_ADD(ht_capa);
+
+	DEBUGFS_ADD_COUNTER(rx_packets, rx_packets);
+	DEBUGFS_ADD_COUNTER(tx_packets, tx_packets);
+	DEBUGFS_ADD_COUNTER(rx_bytes, rx_bytes);
+	DEBUGFS_ADD_COUNTER(tx_bytes, tx_bytes);
+	DEBUGFS_ADD_COUNTER(rx_duplicates, num_duplicates);
+	DEBUGFS_ADD_COUNTER(rx_fragments, rx_fragments);
+	DEBUGFS_ADD_COUNTER(rx_dropped, rx_dropped);
+	DEBUGFS_ADD_COUNTER(tx_fragments, tx_fragments);
+	DEBUGFS_ADD_COUNTER(tx_filtered, tx_filtered_count);
+	DEBUGFS_ADD_COUNTER(tx_retry_failed, tx_retry_failed);
+	DEBUGFS_ADD_COUNTER(tx_retry_count, tx_retry_count);
+	DEBUGFS_ADD_COUNTER(wep_weak_iv_count, wep_weak_iv_count);
 }
 
 void ieee80211_sta_debugfs_remove(struct sta_info *sta)
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 9c1da080916..16983825f8e 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -16,10 +16,11 @@ static inline int drv_start(struct ieee80211_local *local)
 
 	might_sleep();
 
+	trace_drv_start(local);
 	local->started = true;
 	smp_mb();
 	ret = local->ops->start(&local->hw);
-	trace_drv_start(local, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -27,8 +28,9 @@ static inline void drv_stop(struct ieee80211_local *local)
 {
 	might_sleep();
 
-	local->ops->stop(&local->hw);
 	trace_drv_stop(local);
+	local->ops->stop(&local->hw);
+	trace_drv_return_void(local);
 
 	/* sync away all work on the tasklet before clearing started */
 	tasklet_disable(&local->tasklet);
@@ -46,8 +48,23 @@ static inline int drv_add_interface(struct ieee80211_local *local,
 
 	might_sleep();
 
+	trace_drv_add_interface(local, vif_to_sdata(vif));
 	ret = local->ops->add_interface(&local->hw, vif);
-	trace_drv_add_interface(local, vif_to_sdata(vif), ret);
+	trace_drv_return_int(local, ret);
+	return ret;
+}
+
+static inline int drv_change_interface(struct ieee80211_local *local,
+				       struct ieee80211_sub_if_data *sdata,
+				       enum nl80211_iftype type, bool p2p)
+{
+	int ret;
+
+	might_sleep();
+
+	trace_drv_change_interface(local, sdata, type, p2p);
+	ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -56,8 +73,9 @@ static inline void drv_remove_interface(struct ieee80211_local *local,
 {
 	might_sleep();
 
-	local->ops->remove_interface(&local->hw, vif);
 	trace_drv_remove_interface(local, vif_to_sdata(vif));
+	local->ops->remove_interface(&local->hw, vif);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_config(struct ieee80211_local *local, u32 changed)
@@ -66,8 +84,9 @@ static inline int drv_config(struct ieee80211_local *local, u32 changed)
 
 	might_sleep();
 
+	trace_drv_config(local, changed);
 	ret = local->ops->config(&local->hw, changed);
-	trace_drv_config(local, changed, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -78,9 +97,10 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local,
 {
 	might_sleep();
 
+	trace_drv_bss_info_changed(local, sdata, info, changed);
 	if (local->ops->bss_info_changed)
 		local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed);
-	trace_drv_bss_info_changed(local, sdata, info, changed);
+	trace_drv_return_void(local);
 }
 
 static inline u64 drv_prepare_multicast(struct ieee80211_local *local,
@@ -88,10 +108,12 @@ static inline u64 drv_prepare_multicast(struct ieee80211_local *local,
 {
 	u64 ret = 0;
 
+	trace_drv_prepare_multicast(local, mc_list->count);
+
 	if (local->ops->prepare_multicast)
 		ret = local->ops->prepare_multicast(&local->hw, mc_list);
 
-	trace_drv_prepare_multicast(local, mc_list->count, ret);
+	trace_drv_return_u64(local, ret);
 
 	return ret;
 }
@@ -103,19 +125,21 @@ static inline void drv_configure_filter(struct ieee80211_local *local,
 {
 	might_sleep();
 
-	local->ops->configure_filter(&local->hw, changed_flags, total_flags,
-				     multicast);
 	trace_drv_configure_filter(local, changed_flags, total_flags,
 				   multicast);
+	local->ops->configure_filter(&local->hw, changed_flags, total_flags,
+				     multicast);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_set_tim(struct ieee80211_local *local,
 			      struct ieee80211_sta *sta, bool set)
 {
 	int ret = 0;
+	trace_drv_set_tim(local, sta, set);
 	if (local->ops->set_tim)
 		ret = local->ops->set_tim(&local->hw, sta, set);
-	trace_drv_set_tim(local, sta, set, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -129,8 +153,9 @@ static inline int drv_set_key(struct ieee80211_local *local,
 
 	might_sleep();
 
+	trace_drv_set_key(local, cmd, sdata, sta, key);
 	ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key);
-	trace_drv_set_key(local, cmd, sdata, sta, key, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -145,10 +170,11 @@ static inline void drv_update_tkip_key(struct ieee80211_local *local,
 	if (sta)
 		ista = &sta->sta;
 
+	trace_drv_update_tkip_key(local, sdata, conf, ista, iv32);
 	if (local->ops->update_tkip_key)
 		local->ops->update_tkip_key(&local->hw, &sdata->vif, conf,
 					    ista, iv32, phase1key);
-	trace_drv_update_tkip_key(local, sdata, conf, ista, iv32);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_hw_scan(struct ieee80211_local *local,
@@ -159,8 +185,9 @@ static inline int drv_hw_scan(struct ieee80211_local *local,
 
 	might_sleep();
 
+	trace_drv_hw_scan(local, sdata, req);
 	ret = local->ops->hw_scan(&local->hw, &sdata->vif, req);
-	trace_drv_hw_scan(local, sdata, req, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -168,18 +195,20 @@ static inline void drv_sw_scan_start(struct ieee80211_local *local)
 {
 	might_sleep();
 
+	trace_drv_sw_scan_start(local);
 	if (local->ops->sw_scan_start)
 		local->ops->sw_scan_start(&local->hw);
-	trace_drv_sw_scan_start(local);
+	trace_drv_return_void(local);
 }
 
 static inline void drv_sw_scan_complete(struct ieee80211_local *local)
 {
 	might_sleep();
 
+	trace_drv_sw_scan_complete(local);
 	if (local->ops->sw_scan_complete)
 		local->ops->sw_scan_complete(&local->hw);
-	trace_drv_sw_scan_complete(local);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_get_stats(struct ieee80211_local *local,
@@ -211,9 +240,10 @@ static inline int drv_set_rts_threshold(struct ieee80211_local *local,
 
 	might_sleep();
 
+	trace_drv_set_rts_threshold(local, value);
 	if (local->ops->set_rts_threshold)
 		ret = local->ops->set_rts_threshold(&local->hw, value);
-	trace_drv_set_rts_threshold(local, value, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -223,12 +253,13 @@ static inline int drv_set_coverage_class(struct ieee80211_local *local,
 	int ret = 0;
 	might_sleep();
 
+	trace_drv_set_coverage_class(local, value);
 	if (local->ops->set_coverage_class)
 		local->ops->set_coverage_class(&local->hw, value);
 	else
 		ret = -EOPNOTSUPP;
 
-	trace_drv_set_coverage_class(local, value, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -237,9 +268,10 @@ static inline void drv_sta_notify(struct ieee80211_local *local,
 				  enum sta_notify_cmd cmd,
 				  struct ieee80211_sta *sta)
 {
+	trace_drv_sta_notify(local, sdata, cmd, sta);
 	if (local->ops->sta_notify)
 		local->ops->sta_notify(&local->hw, &sdata->vif, cmd, sta);
-	trace_drv_sta_notify(local, sdata, cmd, sta);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_sta_add(struct ieee80211_local *local,
@@ -250,13 +282,11 @@ static inline int drv_sta_add(struct ieee80211_local *local,
 
 	might_sleep();
 
+	trace_drv_sta_add(local, sdata, sta);
 	if (local->ops->sta_add)
 		ret = local->ops->sta_add(&local->hw, &sdata->vif, sta);
-	else if (local->ops->sta_notify)
-		local->ops->sta_notify(&local->hw, &sdata->vif,
-					STA_NOTIFY_ADD, sta);
 
-	trace_drv_sta_add(local, sdata, sta, ret);
+	trace_drv_return_int(local, ret);
 
 	return ret;
 }
@@ -267,13 +297,11 @@ static inline void drv_sta_remove(struct ieee80211_local *local,
 {
 	might_sleep();
 
+	trace_drv_sta_remove(local, sdata, sta);
 	if (local->ops->sta_remove)
 		local->ops->sta_remove(&local->hw, &sdata->vif, sta);
-	else if (local->ops->sta_notify)
-		local->ops->sta_notify(&local->hw, &sdata->vif,
-					STA_NOTIFY_REMOVE, sta);
 
-	trace_drv_sta_remove(local, sdata, sta);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_conf_tx(struct ieee80211_local *local, u16 queue,
@@ -283,9 +311,10 @@ static inline int drv_conf_tx(struct ieee80211_local *local, u16 queue,
 
 	might_sleep();
 
+	trace_drv_conf_tx(local, queue, params);
 	if (local->ops->conf_tx)
 		ret = local->ops->conf_tx(&local->hw, queue, params);
-	trace_drv_conf_tx(local, queue, params, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -295,9 +324,10 @@ static inline u64 drv_get_tsf(struct ieee80211_local *local)
 
 	might_sleep();
 
+	trace_drv_get_tsf(local);
 	if (local->ops->get_tsf)
 		ret = local->ops->get_tsf(&local->hw);
-	trace_drv_get_tsf(local, ret);
+	trace_drv_return_u64(local, ret);
 	return ret;
 }
 
@@ -305,18 +335,20 @@ static inline void drv_set_tsf(struct ieee80211_local *local, u64 tsf)
 {
 	might_sleep();
 
+	trace_drv_set_tsf(local, tsf);
 	if (local->ops->set_tsf)
 		local->ops->set_tsf(&local->hw, tsf);
-	trace_drv_set_tsf(local, tsf);
+	trace_drv_return_void(local);
 }
 
 static inline void drv_reset_tsf(struct ieee80211_local *local)
 {
 	might_sleep();
 
+	trace_drv_reset_tsf(local);
 	if (local->ops->reset_tsf)
 		local->ops->reset_tsf(&local->hw);
-	trace_drv_reset_tsf(local);
+	trace_drv_return_void(local);
 }
 
 static inline int drv_tx_last_beacon(struct ieee80211_local *local)
@@ -325,9 +357,10 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local)
 
 	might_sleep();
 
+	trace_drv_tx_last_beacon(local);
 	if (local->ops->tx_last_beacon)
 		ret = local->ops->tx_last_beacon(&local->hw);
-	trace_drv_tx_last_beacon(local, ret);
+	trace_drv_return_int(local, ret);
 	return ret;
 }
 
@@ -338,10 +371,17 @@ static inline int drv_ampdu_action(struct ieee80211_local *local,
 				   u16 *ssn)
 {
 	int ret = -EOPNOTSUPP;
+
+	might_sleep();
+
+	trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn);
+
 	if (local->ops->ampdu_action)
 		ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
 					       sta, tid, ssn);
-	trace_drv_ampdu_action(local, sdata, action, sta, tid, ssn, ret);
+
+	trace_drv_return_int(local, ret);
+
 	return ret;
 }
 
@@ -349,9 +389,14 @@ static inline int drv_get_survey(struct ieee80211_local *local, int idx,
 				struct survey_info *survey)
 {
 	int ret = -EOPNOTSUPP;
+
+	trace_drv_get_survey(local, idx, survey);
+
 	if (local->ops->get_survey)
 		ret = local->ops->get_survey(&local->hw, idx, survey);
-	/* trace_drv_get_survey(local, idx, survey, ret); */
+
+	trace_drv_return_int(local, ret);
+
 	return ret;
 }
 
@@ -370,6 +415,7 @@ static inline void drv_flush(struct ieee80211_local *local, bool drop)
 	trace_drv_flush(local, drop);
 	if (local->ops->flush)
 		local->ops->flush(&local->hw, drop);
+	trace_drv_return_void(local);
 }
 
 static inline void drv_channel_switch(struct ieee80211_local *local,
@@ -377,9 +423,9 @@ static inline void drv_channel_switch(struct ieee80211_local *local,
 {
 	might_sleep();
 
-	local->ops->channel_switch(&local->hw, ch_switch);
-
 	trace_drv_channel_switch(local, ch_switch);
+	local->ops->channel_switch(&local->hw, ch_switch);
+	trace_drv_return_void(local);
 }
 
 #endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h
index 6a9b2342a9c..6831fb1641c 100644
--- a/net/mac80211/driver-trace.h
+++ b/net/mac80211/driver-trace.h
@@ -25,31 +25,71 @@ static inline void trace_ ## name(proto) {}
 #define STA_PR_FMT	" sta:%pM"
 #define STA_PR_ARG	__entry->sta_addr
 
-#define VIF_ENTRY	__field(enum nl80211_iftype, vif_type) __field(void *, sdata) \
+#define VIF_ENTRY	__field(enum nl80211_iftype, vif_type) __field(void *, sdata)	\
+			__field(bool, p2p)						\
 			__string(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
-#define VIF_ASSIGN	__entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \
+#define VIF_ASSIGN	__entry->vif_type = sdata->vif.type; __entry->sdata = sdata;	\
+			__entry->p2p = sdata->vif.p2p;					\
 			__assign_str(vif_name, sdata->dev ? sdata->dev->name : "<nodev>")
-#define VIF_PR_FMT	" vif:%s(%d)"
-#define VIF_PR_ARG	__get_str(vif_name), __entry->vif_type
+#define VIF_PR_FMT	" vif:%s(%d%s)"
+#define VIF_PR_ARG	__get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""
 
 /*
  * Tracing for driver callbacks.
  */
 
-TRACE_EVENT(drv_start,
-	TP_PROTO(struct ieee80211_local *local, int ret),
+TRACE_EVENT(drv_return_void,
+	TP_PROTO(struct ieee80211_local *local),
+	TP_ARGS(local),
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+	),
+	TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
+);
 
+TRACE_EVENT(drv_return_int,
+	TP_PROTO(struct ieee80211_local *local, int ret),
 	TP_ARGS(local, ret),
-
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		__field(int, ret)
 	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->ret = ret;
+	),
+	TP_printk(LOCAL_PR_FMT " - %d", LOCAL_PR_ARG, __entry->ret)
+);
 
+TRACE_EVENT(drv_return_u64,
+	TP_PROTO(struct ieee80211_local *local, u64 ret),
+	TP_ARGS(local, ret),
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(u64, ret)
+	),
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		__entry->ret = ret;
 	),
+	TP_printk(LOCAL_PR_FMT " - %llu", LOCAL_PR_ARG, __entry->ret)
+);
+
+TRACE_EVENT(drv_start,
+	TP_PROTO(struct ieee80211_local *local),
+
+	TP_ARGS(local),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+	),
 
 	TP_printk(
 		LOCAL_PR_FMT, LOCAL_PR_ARG
@@ -76,28 +116,53 @@ TRACE_EVENT(drv_stop,
 
 TRACE_EVENT(drv_add_interface,
 	TP_PROTO(struct ieee80211_local *local,
-		 struct ieee80211_sub_if_data *sdata,
-		 int ret),
+		 struct ieee80211_sub_if_data *sdata),
 
-	TP_ARGS(local, sdata, ret),
+	TP_ARGS(local, sdata),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
 		__array(char, addr, 6)
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		memcpy(__entry->addr, sdata->vif.addr, 6);
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT " addr:%pM ret:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr, __entry->ret
+		LOCAL_PR_FMT  VIF_PR_FMT " addr:%pM",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->addr
+	)
+);
+
+TRACE_EVENT(drv_change_interface,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 enum nl80211_iftype type, bool p2p),
+
+	TP_ARGS(local, sdata, type, p2p),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u32, new_type)
+		__field(bool, new_p2p)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->new_type = type;
+		__entry->new_p2p = p2p;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT " new type:%d%s",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type,
+		__entry->new_p2p ? "/p2p" : ""
 	)
 );
 
@@ -126,15 +191,13 @@ TRACE_EVENT(drv_remove_interface,
 
 TRACE_EVENT(drv_config,
 	TP_PROTO(struct ieee80211_local *local,
-		 u32 changed,
-		 int ret),
+		 u32 changed),
 
-	TP_ARGS(local, changed, ret),
+	TP_ARGS(local, changed),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		__field(u32, changed)
-		__field(int, ret)
 		__field(u32, flags)
 		__field(int, power_level)
 		__field(int, dynamic_ps_timeout)
@@ -150,7 +213,6 @@ TRACE_EVENT(drv_config,
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		__entry->changed = changed;
-		__entry->ret = ret;
 		__entry->flags = local->hw.conf.flags;
 		__entry->power_level = local->hw.conf.power_level;
 		__entry->dynamic_ps_timeout = local->hw.conf.dynamic_ps_timeout;
@@ -164,8 +226,8 @@ TRACE_EVENT(drv_config,
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " ch:%#x freq:%d ret:%d",
-		LOCAL_PR_ARG, __entry->changed, __entry->center_freq, __entry->ret
+		LOCAL_PR_FMT " ch:%#x freq:%d",
+		LOCAL_PR_ARG, __entry->changed, __entry->center_freq
 	)
 );
 
@@ -220,26 +282,23 @@ TRACE_EVENT(drv_bss_info_changed,
 );
 
 TRACE_EVENT(drv_prepare_multicast,
-	TP_PROTO(struct ieee80211_local *local, int mc_count, u64 ret),
+	TP_PROTO(struct ieee80211_local *local, int mc_count),
 
-	TP_ARGS(local, mc_count, ret),
+	TP_ARGS(local, mc_count),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		__field(int, mc_count)
-		__field(u64, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		__entry->mc_count = mc_count;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " prepare mc (%d): %llx",
-		LOCAL_PR_ARG, __entry->mc_count,
-		(unsigned long long) __entry->ret
+		LOCAL_PR_FMT " prepare mc (%d)",
+		LOCAL_PR_ARG, __entry->mc_count
 	)
 );
 
@@ -273,27 +332,25 @@ TRACE_EVENT(drv_configure_filter,
 
 TRACE_EVENT(drv_set_tim,
 	TP_PROTO(struct ieee80211_local *local,
-		 struct ieee80211_sta *sta, bool set, int ret),
+		 struct ieee80211_sta *sta, bool set),
 
-	TP_ARGS(local, sta, set, ret),
+	TP_ARGS(local, sta, set),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		STA_ENTRY
 		__field(bool, set)
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		STA_ASSIGN;
 		__entry->set = set;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT STA_PR_FMT " set:%d ret:%d",
-		LOCAL_PR_ARG, STA_PR_FMT, __entry->set, __entry->ret
+		LOCAL_PR_FMT STA_PR_FMT " set:%d",
+		LOCAL_PR_ARG, STA_PR_FMT, __entry->set
 	)
 );
 
@@ -301,35 +358,33 @@ TRACE_EVENT(drv_set_key,
 	TP_PROTO(struct ieee80211_local *local,
 		 enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata,
 		 struct ieee80211_sta *sta,
-		 struct ieee80211_key_conf *key, int ret),
+		 struct ieee80211_key_conf *key),
 
-	TP_ARGS(local, cmd, sdata, sta, key, ret),
+	TP_ARGS(local, cmd, sdata, sta, key),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
 		STA_ENTRY
-		__field(enum ieee80211_key_alg, alg)
+		__field(u32, cipher)
 		__field(u8, hw_key_idx)
 		__field(u8, flags)
 		__field(s8, keyidx)
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		STA_ASSIGN;
-		__entry->alg = key->alg;
+		__entry->cipher = key->cipher;
 		__entry->flags = key->flags;
 		__entry->keyidx = key->keyidx;
 		__entry->hw_key_idx = key->hw_key_idx;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " ret:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ret
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
 	)
 );
 
@@ -364,25 +419,23 @@ TRACE_EVENT(drv_update_tkip_key,
 TRACE_EVENT(drv_hw_scan,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 struct cfg80211_scan_request *req, int ret),
+		 struct cfg80211_scan_request *req),
 
-	TP_ARGS(local, sdata, req, ret),
+	TP_ARGS(local, sdata, req),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT " ret:%d",
-		LOCAL_PR_ARG,VIF_PR_ARG, __entry->ret
+		LOCAL_PR_FMT VIF_PR_FMT,
+		LOCAL_PR_ARG,VIF_PR_ARG
 	)
 );
 
@@ -479,48 +532,44 @@ TRACE_EVENT(drv_get_tkip_seq,
 );
 
 TRACE_EVENT(drv_set_rts_threshold,
-	TP_PROTO(struct ieee80211_local *local, u32 value, int ret),
+	TP_PROTO(struct ieee80211_local *local, u32 value),
 
-	TP_ARGS(local, value, ret),
+	TP_ARGS(local, value),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		__field(u32, value)
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
-		__entry->ret = ret;
 		__entry->value = value;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " value:%d ret:%d",
-		LOCAL_PR_ARG, __entry->value, __entry->ret
+		LOCAL_PR_FMT " value:%d",
+		LOCAL_PR_ARG, __entry->value
 	)
 );
 
 TRACE_EVENT(drv_set_coverage_class,
-	TP_PROTO(struct ieee80211_local *local, u8 value, int ret),
+	TP_PROTO(struct ieee80211_local *local, u8 value),
 
-	TP_ARGS(local, value, ret),
+	TP_ARGS(local, value),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		__field(u8, value)
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
-		__entry->ret = ret;
 		__entry->value = value;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " value:%d ret:%d",
-		LOCAL_PR_ARG, __entry->value, __entry->ret
+		LOCAL_PR_FMT " value:%d",
+		LOCAL_PR_ARG, __entry->value
 	)
 );
 
@@ -555,27 +604,25 @@ TRACE_EVENT(drv_sta_notify,
 TRACE_EVENT(drv_sta_add,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-		 struct ieee80211_sta *sta, int ret),
+		 struct ieee80211_sta *sta),
 
-	TP_ARGS(local, sdata, sta, ret),
+	TP_ARGS(local, sdata, sta),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
 		VIF_ENTRY
 		STA_ENTRY
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		STA_ASSIGN;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " ret:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->ret
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT,
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG
 	)
 );
 
@@ -606,10 +653,9 @@ TRACE_EVENT(drv_sta_remove,
 
 TRACE_EVENT(drv_conf_tx,
 	TP_PROTO(struct ieee80211_local *local, u16 queue,
-		 const struct ieee80211_tx_queue_params *params,
-		 int ret),
+		 const struct ieee80211_tx_queue_params *params),
 
-	TP_ARGS(local, queue, params, ret),
+	TP_ARGS(local, queue, params),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -618,13 +664,11 @@ TRACE_EVENT(drv_conf_tx,
 		__field(u16, cw_min)
 		__field(u16, cw_max)
 		__field(u8, aifs)
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
 		__entry->queue = queue;
-		__entry->ret = ret;
 		__entry->txop = params->txop;
 		__entry->cw_max = params->cw_max;
 		__entry->cw_min = params->cw_min;
@@ -632,29 +676,27 @@ TRACE_EVENT(drv_conf_tx,
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " queue:%d ret:%d",
-		LOCAL_PR_ARG, __entry->queue, __entry->ret
+		LOCAL_PR_FMT " queue:%d",
+		LOCAL_PR_ARG, __entry->queue
 	)
 );
 
 TRACE_EVENT(drv_get_tsf,
-	TP_PROTO(struct ieee80211_local *local, u64 ret),
+	TP_PROTO(struct ieee80211_local *local),
 
-	TP_ARGS(local, ret),
+	TP_ARGS(local),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
-		__field(u64, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " ret:%llu",
-		LOCAL_PR_ARG, (unsigned long long)__entry->ret
+		LOCAL_PR_FMT,
+		LOCAL_PR_ARG
 	)
 );
 
@@ -698,23 +740,21 @@ TRACE_EVENT(drv_reset_tsf,
 );
 
 TRACE_EVENT(drv_tx_last_beacon,
-	TP_PROTO(struct ieee80211_local *local, int ret),
+	TP_PROTO(struct ieee80211_local *local),
 
-	TP_ARGS(local, ret),
+	TP_ARGS(local),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
-		__field(int, ret)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
-		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " ret:%d",
-		LOCAL_PR_ARG, __entry->ret
+		LOCAL_PR_FMT,
+		LOCAL_PR_ARG
 	)
 );
 
@@ -723,9 +763,9 @@ TRACE_EVENT(drv_ampdu_action,
 		 struct ieee80211_sub_if_data *sdata,
 		 enum ieee80211_ampdu_mlme_action action,
 		 struct ieee80211_sta *sta, u16 tid,
-		 u16 *ssn, int ret),
+		 u16 *ssn),
 
-	TP_ARGS(local, sdata, action, sta, tid, ssn, ret),
+	TP_ARGS(local, sdata, action, sta, tid, ssn),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
@@ -733,7 +773,6 @@ TRACE_EVENT(drv_ampdu_action,
 		__field(u32, action)
 		__field(u16, tid)
 		__field(u16, ssn)
-		__field(int, ret)
 		VIF_ENTRY
 	),
 
@@ -741,15 +780,36 @@ TRACE_EVENT(drv_ampdu_action,
 		LOCAL_ASSIGN;
 		VIF_ASSIGN;
 		STA_ASSIGN;
-		__entry->ret = ret;
 		__entry->action = action;
 		__entry->tid = tid;
 		__entry->ssn = ssn ? *ssn : 0;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d ret:%d",
-		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, __entry->tid, __entry->ret
+		LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d",
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action, __entry->tid
+	)
+);
+
+TRACE_EVENT(drv_get_survey,
+	TP_PROTO(struct ieee80211_local *local, int idx,
+		 struct survey_info *survey),
+
+	TP_ARGS(local, idx, survey),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, idx)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->idx = idx;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " idx:%d",
+		LOCAL_PR_ARG, __entry->idx
 	)
 );
 
@@ -851,25 +911,23 @@ TRACE_EVENT(api_start_tx_ba_cb,
 );
 
 TRACE_EVENT(api_stop_tx_ba_session,
-	TP_PROTO(struct ieee80211_sta *sta, u16 tid, u16 initiator),
+	TP_PROTO(struct ieee80211_sta *sta, u16 tid),
 
-	TP_ARGS(sta, tid, initiator),
+	TP_ARGS(sta, tid),
 
 	TP_STRUCT__entry(
 		STA_ENTRY
 		__field(u16, tid)
-		__field(u16, initiator)
 	),
 
 	TP_fast_assign(
 		STA_ASSIGN;
 		__entry->tid = tid;
-		__entry->initiator = initiator;
 	),
 
 	TP_printk(
-		STA_PR_FMT " tid:%d initiator:%d",
-		STA_PR_ARG, __entry->tid, __entry->initiator
+		STA_PR_FMT " tid:%d",
+		STA_PR_ARG, __entry->tid
 	)
 );
 
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 2ab106a0a49..75d679d75e6 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -6,7 +6,7 @@
  * Copyright 2005-2006, Devicescape Software, Inc.
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
- * Copyright 2007-2008, Intel Corporation
+ * Copyright 2007-2010, Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -29,7 +29,7 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
 
 	memset(ht_cap, 0, sizeof(*ht_cap));
 
-	if (!ht_cap_ie)
+	if (!ht_cap_ie || !sband->ht_cap.ht_supported)
 		return;
 
 	ht_cap->ht_supported = true;
@@ -101,17 +101,57 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
 		ht_cap->mcs.rx_mask[32/8] |= 1;
 }
 
-void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta)
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx)
 {
 	int i;
 
+	cancel_work_sync(&sta->ampdu_mlme.work);
+
 	for (i = 0; i <  STA_TID_NUM; i++) {
-		__ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR);
+		__ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR, tx);
 		__ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
-					       WLAN_REASON_QSTA_LEAVE_QBSS);
+					       WLAN_REASON_QSTA_LEAVE_QBSS, tx);
 	}
 }
 
+void ieee80211_ba_session_work(struct work_struct *work)
+{
+	struct sta_info *sta =
+		container_of(work, struct sta_info, ampdu_mlme.work);
+	struct tid_ampdu_tx *tid_tx;
+	int tid;
+
+	/*
+	 * When this flag is set, new sessions should be
+	 * blocked, and existing sessions will be torn
+	 * down by the code that set the flag, so this
+	 * need not run.
+	 */
+	if (test_sta_flags(sta, WLAN_STA_BLOCK_BA))
+		return;
+
+	mutex_lock(&sta->ampdu_mlme.mtx);
+	for (tid = 0; tid < STA_TID_NUM; tid++) {
+		if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired))
+			___ieee80211_stop_rx_ba_session(
+				sta, tid, WLAN_BACK_RECIPIENT,
+				WLAN_REASON_QSTA_TIMEOUT, true);
+
+		tid_tx = sta->ampdu_mlme.tid_tx[tid];
+		if (!tid_tx)
+			continue;
+
+		if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state))
+			ieee80211_tx_ba_session_handle_start(sta, tid);
+		else if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP,
+					    &tid_tx->state))
+			___ieee80211_stop_tx_ba_session(sta, tid,
+							WLAN_BACK_INITIATOR,
+							true);
+	}
+	mutex_unlock(&sta->ampdu_mlme.mtx);
+}
+
 void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
 			  const u8 *da, u16 tid,
 			  u16 initiator, u16 reason_code)
@@ -175,14 +215,11 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
 	if (initiator == WLAN_BACK_INITIATOR)
-		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0);
-	else { /* WLAN_BACK_RECIPIENT */
-		spin_lock_bh(&sta->lock);
-		if (sta->ampdu_mlme.tid_state_tx[tid] & HT_ADDBA_REQUESTED_MSK)
-			___ieee80211_stop_tx_ba_session(sta, tid,
-							WLAN_BACK_RECIPIENT);
-		spin_unlock_bh(&sta->lock);
-	}
+		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0,
+					       true);
+	else
+		__ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+					       true);
 }
 
 int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
@@ -231,3 +268,33 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
 
 	return 0;
 }
+
+void ieee80211_request_smps_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data,
+			     u.mgd.request_smps_work);
+
+	mutex_lock(&sdata->u.mgd.mtx);
+	__ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode);
+	mutex_unlock(&sdata->u.mgd.mtx);
+}
+
+void ieee80211_request_smps(struct ieee80211_vif *vif,
+			    enum ieee80211_smps_mode smps_mode)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
+		return;
+
+	if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF))
+		smps_mode = IEEE80211_SMPS_AUTOMATIC;
+
+	sdata->u.mgd.driver_smps_mode = smps_mode;
+
+	ieee80211_queue_work(&sdata->local->hw,
+			     &sdata->u.mgd.request_smps_work);
+}
+/* this might change ... don't want non-open drivers using it */
+EXPORT_SYMBOL_GPL(ieee80211_request_smps);
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index b2cc1fda6cf..239c4836a94 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -43,6 +43,8 @@ static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
 {
 	u16 auth_alg, auth_transaction, status_code;
 
+	lockdep_assert_held(&sdata->u.ibss.mtx);
+
 	if (len < 24 + 6)
 		return;
 
@@ -78,6 +80,8 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	u32 bss_change;
 	u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
 
+	lockdep_assert_held(&ifibss->mtx);
+
 	/* Reset own TSF to allow time synchronization work. */
 	drv_reset_tsf(local);
 
@@ -169,14 +173,29 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 		memcpy(skb_put(skb, ifibss->ie_len),
 		       ifibss->ie, ifibss->ie_len);
 
+	if (local->hw.queues >= 4) {
+		pos = skb_put(skb, 9);
+		*pos++ = WLAN_EID_VENDOR_SPECIFIC;
+		*pos++ = 7; /* len */
+		*pos++ = 0x00; /* Microsoft OUI 00:50:F2 */
+		*pos++ = 0x50;
+		*pos++ = 0xf2;
+		*pos++ = 2; /* WME */
+		*pos++ = 0; /* WME info */
+		*pos++ = 1; /* WME ver */
+		*pos++ = 0; /* U-APSD no in use */
+	}
+
 	rcu_assign_pointer(ifibss->presp, skb);
 
 	sdata->vif.bss_conf.beacon_int = beacon_int;
+	sdata->vif.bss_conf.basic_rates = basic_rates;
 	bss_change = BSS_CHANGED_BEACON_INT;
 	bss_change |= ieee80211_reset_erp_info(sdata);
 	bss_change |= BSS_CHANGED_BSSID;
 	bss_change |= BSS_CHANGED_BEACON;
 	bss_change |= BSS_CHANGED_BEACON_ENABLED;
+	bss_change |= BSS_CHANGED_BASIC_RATES;
 	bss_change |= BSS_CHANGED_IBSS;
 	sdata->vif.bss_conf.ibss_joined = true;
 	ieee80211_bss_info_change_notify(sdata, bss_change);
@@ -203,6 +222,8 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
 	int i, j;
 	u16 beacon_int = cbss->beacon_interval;
 
+	lockdep_assert_held(&sdata->u.ibss.mtx);
+
 	if (beacon_int < 10)
 		beacon_int = 10;
 
@@ -258,37 +279,45 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
 	if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
 		return;
 
-	if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
+	if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
 	    memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
-		supp_rates = ieee80211_sta_get_rates(local, elems, band);
 
 		rcu_read_lock();
-
 		sta = sta_info_get(sdata, mgmt->sa);
-		if (sta) {
-			u32 prev_rates;
 
-			prev_rates = sta->sta.supp_rates[band];
-			/* make sure mandatory rates are always added */
-			sta->sta.supp_rates[band] = supp_rates |
-				ieee80211_mandatory_rates(local, band);
+		if (elems->supp_rates) {
+			supp_rates = ieee80211_sta_get_rates(local, elems,
+							     band);
+			if (sta) {
+				u32 prev_rates;
+
+				prev_rates = sta->sta.supp_rates[band];
+				/* make sure mandatory rates are always added */
+				sta->sta.supp_rates[band] = supp_rates |
+					ieee80211_mandatory_rates(local, band);
 
-			if (sta->sta.supp_rates[band] != prev_rates) {
+				if (sta->sta.supp_rates[band] != prev_rates) {
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
-				printk(KERN_DEBUG "%s: updated supp_rates set "
-				    "for %pM based on beacon/probe_response "
-				    "(0x%x -> 0x%x)\n",
-				    sdata->name, sta->sta.addr,
-				    prev_rates, sta->sta.supp_rates[band]);
+					printk(KERN_DEBUG
+						"%s: updated supp_rates set "
+						"for %pM based on beacon"
+						"/probe_resp (0x%x -> 0x%x)\n",
+						sdata->name, sta->sta.addr,
+						prev_rates,
+						sta->sta.supp_rates[band]);
 #endif
-				rate_control_rate_init(sta);
-			}
-			rcu_read_unlock();
-		} else {
-			rcu_read_unlock();
-			ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa,
-					       supp_rates, GFP_KERNEL);
+					rate_control_rate_init(sta);
+				}
+			} else
+				sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid,
+						mgmt->sa, supp_rates,
+						GFP_ATOMIC);
 		}
+
+		if (sta && elems->wmm_info)
+			set_sta_flags(sta, WLAN_STA_WME);
+
+		rcu_read_unlock();
 	}
 
 	bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
@@ -419,14 +448,15 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 		return NULL;
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
-	       wiphy_name(local->hw.wiphy), addr, sdata->name);
+	wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n",
+		    addr, sdata->name);
 #endif
 
 	sta = sta_info_alloc(sdata, addr, gfp);
 	if (!sta)
 		return NULL;
 
+	sta->last_rx = jiffies;
 	set_sta_flags(sta, WLAN_STA_AUTHORIZED);
 
 	/* make sure mandatory rates are always added */
@@ -447,6 +477,8 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 	int active = 0;
 	struct sta_info *sta;
 
+	lockdep_assert_held(&sdata->u.ibss.mtx);
+
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(sta, &local->sta_list, list) {
@@ -471,6 +503,8 @@ static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 
+	lockdep_assert_held(&ifibss->mtx);
+
 	mod_timer(&ifibss->timer,
 		  round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL));
 
@@ -503,6 +537,8 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
 	u16 capability;
 	int i;
 
+	lockdep_assert_held(&ifibss->mtx);
+
 	if (ifibss->fixed_bssid) {
 		memcpy(bssid, ifibss->bssid, ETH_ALEN);
 	} else {
@@ -529,7 +565,7 @@ static void ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
 		sdata->drop_unencrypted = 0;
 
 	__ieee80211_sta_join_ibss(sdata, bssid, sdata->vif.bss_conf.beacon_int,
-				  ifibss->channel, 3, /* first two are basic */
+				  ifibss->channel, ifibss->basic_rates,
 				  capability, 0);
 }
 
@@ -547,6 +583,8 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
 	int active_ibss;
 	u16 capability;
 
+	lockdep_assert_held(&ifibss->mtx);
+
 	active_ibss = ieee80211_sta_active_ibss(sdata);
 #ifdef CONFIG_MAC80211_IBSS_DEBUG
 	printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
@@ -635,6 +673,8 @@ static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_mgmt *resp;
 	u8 *pos, *end;
 
+	lockdep_assert_held(&ifibss->mtx);
+
 	if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
 	    len < 24 + 2 || !ifibss->presp)
 		return;
@@ -727,8 +767,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
 }
 
-static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
-					  struct sk_buff *skb)
+void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb)
 {
 	struct ieee80211_rx_status *rx_status;
 	struct ieee80211_mgmt *mgmt;
@@ -738,6 +778,8 @@ static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 	mgmt = (struct ieee80211_mgmt *) skb->data;
 	fc = le16_to_cpu(mgmt->frame_control);
 
+	mutex_lock(&sdata->u.ibss.mtx);
+
 	switch (fc & IEEE80211_FCTL_STYPE) {
 	case IEEE80211_STYPE_PROBE_REQ:
 		ieee80211_rx_mgmt_probe_req(sdata, mgmt, skb->len);
@@ -755,35 +797,22 @@ static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 
-	kfree_skb(skb);
+	mutex_unlock(&sdata->u.ibss.mtx);
 }
 
-static void ieee80211_ibss_work(struct work_struct *work)
+void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data, u.ibss.work);
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_ibss *ifibss;
-	struct sk_buff *skb;
-
-	if (WARN_ON(local->suspended))
-		return;
-
-	if (!ieee80211_sdata_running(sdata))
-		return;
-
-	if (local->scanning)
-		return;
-
-	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_ADHOC))
-		return;
-	ifibss = &sdata->u.ibss;
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 
-	while ((skb = skb_dequeue(&ifibss->skb_queue)))
-		ieee80211_ibss_rx_queued_mgmt(sdata, skb);
+	mutex_lock(&ifibss->mtx);
 
-	if (!test_and_clear_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request))
-		return;
+	/*
+	 * Work could be scheduled after scan or similar
+	 * when we aren't even joined (or trying) with a
+	 * network.
+	 */
+	if (!ifibss->ssid_len)
+		goto out;
 
 	switch (ifibss->state) {
 	case IEEE80211_IBSS_MLME_SEARCH:
@@ -796,6 +825,9 @@ static void ieee80211_ibss_work(struct work_struct *work)
 		WARN_ON(1);
 		break;
 	}
+
+ out:
+	mutex_unlock(&ifibss->mtx);
 }
 
 static void ieee80211_ibss_timer(unsigned long data)
@@ -810,8 +842,7 @@ static void ieee80211_ibss_timer(unsigned long data)
 		return;
 	}
 
-	set_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request);
-	ieee80211_queue_work(&local->hw, &ifibss->work);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 
 #ifdef CONFIG_PM
@@ -819,7 +850,6 @@ void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 
-	cancel_work_sync(&ifibss->work);
 	if (del_timer_sync(&ifibss->timer))
 		ifibss->timer_running = true;
 }
@@ -839,10 +869,9 @@ void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 
-	INIT_WORK(&ifibss->work, ieee80211_ibss_work);
 	setup_timer(&ifibss->timer, ieee80211_ibss_timer,
 		    (unsigned long) sdata);
-	skb_queue_head_init(&ifibss->skb_queue);
+	mutex_init(&ifibss->mtx);
 }
 
 /* scan finished notification */
@@ -856,45 +885,28 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
 			continue;
 		if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
 			continue;
-		if (!sdata->u.ibss.ssid_len)
-			continue;
 		sdata->u.ibss.last_scan_completed = jiffies;
-		mod_timer(&sdata->u.ibss.timer, 0);
+		ieee80211_queue_work(&local->hw, &sdata->work);
 	}
 	mutex_unlock(&local->iflist_mtx);
 }
 
-ieee80211_rx_result
-ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_mgmt *mgmt;
-	u16 fc;
-
-	if (skb->len < 24)
-		return RX_DROP_MONITOR;
-
-	mgmt = (struct ieee80211_mgmt *) skb->data;
-	fc = le16_to_cpu(mgmt->frame_control);
-
-	switch (fc & IEEE80211_FCTL_STYPE) {
-	case IEEE80211_STYPE_PROBE_RESP:
-	case IEEE80211_STYPE_BEACON:
-	case IEEE80211_STYPE_PROBE_REQ:
-	case IEEE80211_STYPE_AUTH:
-		skb_queue_tail(&sdata->u.ibss.skb_queue, skb);
-		ieee80211_queue_work(&local->hw, &sdata->u.ibss.work);
-		return RX_QUEUED;
-	}
-
-	return RX_DROP_MONITOR;
-}
-
 int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 			struct cfg80211_ibss_params *params)
 {
 	struct sk_buff *skb;
 
+	skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom +
+			    36 /* bitrates */ +
+			    34 /* SSID */ +
+			    3  /* DS params */ +
+			    4  /* IBSS params */ +
+			    params->ie_len);
+	if (!skb)
+		return -ENOMEM;
+
+	mutex_lock(&sdata->u.ibss.mtx);
+
 	if (params->bssid) {
 		memcpy(sdata->u.ibss.bssid, params->bssid, ETH_ALEN);
 		sdata->u.ibss.fixed_bssid = true;
@@ -902,6 +914,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 		sdata->u.ibss.fixed_bssid = false;
 
 	sdata->u.ibss.privacy = params->privacy;
+	sdata->u.ibss.basic_rates = params->basic_rates;
 
 	sdata->vif.bss_conf.beacon_int = params->beacon_interval;
 
@@ -922,34 +935,20 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 			sdata->u.ibss.ie_len = params->ie_len;
 	}
 
-	skb = dev_alloc_skb(sdata->local->hw.extra_tx_headroom +
-			    36 /* bitrates */ +
-			    34 /* SSID */ +
-			    3  /* DS params */ +
-			    4  /* IBSS params */ +
-			    params->ie_len);
-	if (!skb)
-		return -ENOMEM;
-
 	sdata->u.ibss.skb = skb;
 	sdata->u.ibss.state = IEEE80211_IBSS_MLME_SEARCH;
 	sdata->u.ibss.ibss_join_req = jiffies;
 
 	memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN);
-
-	/*
-	 * The ssid_len setting below is used to see whether
-	 * we are active, and we need all other settings
-	 * before that may get visible.
-	 */
-	mb();
-
 	sdata->u.ibss.ssid_len = params->ssid_len;
 
+	mutex_unlock(&sdata->u.ibss.mtx);
+
+	mutex_lock(&sdata->local->mtx);
 	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
 
-	set_bit(IEEE80211_IBSS_REQ_RUN, &sdata->u.ibss.request);
-	ieee80211_queue_work(&sdata->local->hw, &sdata->u.ibss.work);
+	ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 
 	return 0;
 }
@@ -957,11 +956,33 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
 {
 	struct sk_buff *skb;
+	struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
+	struct ieee80211_local *local = sdata->local;
+	struct cfg80211_bss *cbss;
+	u16 capability;
+	int active_ibss;
 
-	del_timer_sync(&sdata->u.ibss.timer);
-	clear_bit(IEEE80211_IBSS_REQ_RUN, &sdata->u.ibss.request);
-	cancel_work_sync(&sdata->u.ibss.work);
-	clear_bit(IEEE80211_IBSS_REQ_RUN, &sdata->u.ibss.request);
+	mutex_lock(&sdata->u.ibss.mtx);
+
+	active_ibss = ieee80211_sta_active_ibss(sdata);
+
+	if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) {
+		capability = WLAN_CAPABILITY_IBSS;
+
+		if (ifibss->privacy)
+			capability |= WLAN_CAPABILITY_PRIVACY;
+
+		cbss = cfg80211_get_bss(local->hw.wiphy, ifibss->channel,
+					ifibss->bssid, ifibss->ssid,
+					ifibss->ssid_len, WLAN_CAPABILITY_IBSS |
+					WLAN_CAPABILITY_PRIVACY,
+					capability);
+
+		if (cbss) {
+			cfg80211_unlink_bss(local->hw.wiphy, cbss);
+			cfg80211_put_bss(cbss);
+		}
+	}
 
 	sta_info_flush(sdata->local, sdata);
 
@@ -975,11 +996,17 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
 	synchronize_rcu();
 	kfree_skb(skb);
 
-	skb_queue_purge(&sdata->u.ibss.skb_queue);
+	skb_queue_purge(&sdata->skb_queue);
 	memset(sdata->u.ibss.bssid, 0, ETH_ALEN);
 	sdata->u.ibss.ssid_len = 0;
 
+	del_timer_sync(&sdata->u.ibss.timer);
+
+	mutex_unlock(&sdata->u.ibss.mtx);
+
+	mutex_lock(&local->mtx);
 	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&local->mtx);
 
 	return 0;
 }
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 1a9e2da37a9..b80c3868992 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -50,12 +50,6 @@ struct ieee80211_local;
  * increased memory use (about 2 kB of RAM per entry). */
 #define IEEE80211_FRAGMENT_MAX 4
 
-/*
- * Time after which we ignore scan results and no longer report/use
- * them in any way.
- */
-#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ)
-
 #define TU_TO_EXP_TIME(x)	(jiffies + usecs_to_jiffies((x) * 1024))
 
 #define IEEE80211_DEFAULT_UAPSD_QUEUES \
@@ -165,12 +159,37 @@ typedef unsigned __bitwise__ ieee80211_rx_result;
 #define RX_DROP_MONITOR		((__force ieee80211_rx_result) 2u)
 #define RX_QUEUED		((__force ieee80211_rx_result) 3u)
 
-#define IEEE80211_RX_IN_SCAN		BIT(0)
-/* frame is destined to interface currently processed (incl. multicast frames) */
-#define IEEE80211_RX_RA_MATCH		BIT(1)
-#define IEEE80211_RX_AMSDU		BIT(2)
-#define IEEE80211_RX_FRAGMENTED		BIT(3)
-/* only add flags here that do not change with subframes of an aMPDU */
+/**
+ * enum ieee80211_packet_rx_flags - packet RX flags
+ * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed
+ *	(incl. multicast frames)
+ * @IEEE80211_RX_IN_SCAN: received while scanning
+ * @IEEE80211_RX_FRAGMENTED: fragmented frame
+ * @IEEE80211_RX_AMSDU: a-MSDU packet
+ * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed
+ *
+ * These are per-frame flags that are attached to a frame in the
+ * @rx_flags field of &struct ieee80211_rx_status.
+ */
+enum ieee80211_packet_rx_flags {
+	IEEE80211_RX_IN_SCAN			= BIT(0),
+	IEEE80211_RX_RA_MATCH			= BIT(1),
+	IEEE80211_RX_FRAGMENTED			= BIT(2),
+	IEEE80211_RX_AMSDU			= BIT(3),
+	IEEE80211_RX_MALFORMED_ACTION_FRM	= BIT(4),
+};
+
+/**
+ * enum ieee80211_rx_flags - RX data flags
+ *
+ * @IEEE80211_RX_CMNTR: received on cooked monitor already
+ *
+ * These flags are used across handling multiple interfaces
+ * for a single frame.
+ */
+enum ieee80211_rx_flags {
+	IEEE80211_RX_CMNTR		= BIT(0),
+};
 
 struct ieee80211_rx_data {
 	struct sk_buff *skb;
@@ -238,6 +257,7 @@ enum ieee80211_work_type {
 	IEEE80211_WORK_ABORT,
 	IEEE80211_WORK_DIRECT_PROBE,
 	IEEE80211_WORK_AUTH,
+	IEEE80211_WORK_ASSOC_BEACON_WAIT,
 	IEEE80211_WORK_ASSOC,
 	IEEE80211_WORK_REMAIN_ON_CHANNEL,
 };
@@ -325,7 +345,6 @@ struct ieee80211_if_managed {
 	struct timer_list conn_mon_timer;
 	struct timer_list bcn_mon_timer;
 	struct timer_list chswitch_timer;
-	struct work_struct work;
 	struct work_struct monitor_work;
 	struct work_struct chswitch_work;
 	struct work_struct beacon_connection_loss_work;
@@ -340,15 +359,17 @@ struct ieee80211_if_managed {
 
 	u16 aid;
 
-	struct sk_buff_head skb_queue;
-
 	unsigned long timers_running; /* used for quiesce/restart */
 	bool powersave; /* powersave requested for this iface */
 	enum ieee80211_smps_mode req_smps, /* requested smps mode */
-				 ap_smps; /* smps mode AP thinks we're in */
+				 ap_smps, /* smps mode AP thinks we're in */
+				 driver_smps_mode; /* smps mode request */
+
+	struct work_struct request_smps_work;
 
 	unsigned int flags;
 
+	bool beacon_crc_valid;
 	u32 beacon_crc;
 
 	enum {
@@ -373,6 +394,13 @@ struct ieee80211_if_managed {
 	int ave_beacon_signal;
 
 	/*
+	 * Number of Beacon frames used in ave_beacon_signal. This can be used
+	 * to avoid generating less reliable cqm events that would be based
+	 * only on couple of received frames.
+	 */
+	unsigned int count_beacon_signal;
+
+	/*
 	 * Last Beacon frame signal strength average (ave_beacon_signal / 16)
 	 * that triggered a cqm event. 0 indicates that no event has been
 	 * generated for the current association.
@@ -380,19 +408,15 @@ struct ieee80211_if_managed {
 	int last_cqm_event_signal;
 };
 
-enum ieee80211_ibss_request {
-	IEEE80211_IBSS_REQ_RUN	= 0,
-};
-
 struct ieee80211_if_ibss {
 	struct timer_list timer;
-	struct work_struct work;
 
-	struct sk_buff_head skb_queue;
+	struct mutex mtx;
 
-	unsigned long request;
 	unsigned long last_scan_completed;
 
+	u32 basic_rates;
+
 	bool timer_running;
 
 	bool fixed_bssid;
@@ -416,11 +440,9 @@ struct ieee80211_if_ibss {
 };
 
 struct ieee80211_if_mesh {
-	struct work_struct work;
 	struct timer_list housekeeping_timer;
 	struct timer_list mesh_path_timer;
 	struct timer_list mesh_path_root_timer;
-	struct sk_buff_head skb_queue;
 
 	unsigned long timers_running;
 
@@ -482,6 +504,19 @@ enum ieee80211_sub_if_data_flags {
 	IEEE80211_SDATA_DONT_BRIDGE_PACKETS	= BIT(3),
 };
 
+/**
+ * enum ieee80211_sdata_state_bits - virtual interface state bits
+ * @SDATA_STATE_RUNNING: virtual interface is up & running; this
+ *	mirrors netif_running() but is separate for interface type
+ *	change handling while the interface is up
+ * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel
+ *	mode, so queues are stopped
+ */
+enum ieee80211_sdata_state_bits {
+	SDATA_STATE_RUNNING,
+	SDATA_STATE_OFFCHANNEL,
+};
+
 struct ieee80211_sub_if_data {
 	struct list_head list;
 
@@ -495,6 +530,8 @@ struct ieee80211_sub_if_data {
 
 	unsigned int flags;
 
+	unsigned long state;
+
 	int drop_unencrypted;
 
 	char name[IFNAMSIZ];
@@ -505,17 +542,25 @@ struct ieee80211_sub_if_data {
 	 */
 	bool ht_opmode_valid;
 
+	/* to detect idle changes */
+	bool old_idle;
+
 	/* Fragment table for host-based reassembly */
 	struct ieee80211_fragment_entry	fragments[IEEE80211_FRAGMENT_MAX];
 	unsigned int fragment_next;
 
-#define NUM_DEFAULT_KEYS 4
-#define NUM_DEFAULT_MGMT_KEYS 2
 	struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
 	struct ieee80211_key *default_key;
 	struct ieee80211_key *default_mgmt_key;
 
 	u16 sequence_number;
+	__be16 control_port_protocol;
+	bool control_port_no_encrypt;
+
+	struct work_struct work;
+	struct sk_buff_head skb_queue;
+
+	bool arp_filter_state;
 
 	/*
 	 * AP this belongs to: self in AP mode and
@@ -542,6 +587,7 @@ struct ieee80211_sub_if_data {
 #ifdef CONFIG_MAC80211_DEBUGFS
 	struct {
 		struct dentry *dir;
+		struct dentry *subdir_stations;
 		struct dentry *default_key;
 		struct dentry *default_mgmt_key;
 	} debugfs;
@@ -569,11 +615,15 @@ ieee80211_sdata_set_mesh_id(struct ieee80211_sub_if_data *sdata,
 #endif
 }
 
+enum sdata_queue_type {
+	IEEE80211_SDATA_QUEUE_TYPE_FRAME	= 0,
+	IEEE80211_SDATA_QUEUE_AGG_START		= 1,
+	IEEE80211_SDATA_QUEUE_AGG_STOP		= 2,
+};
+
 enum {
 	IEEE80211_RX_MSG	= 1,
 	IEEE80211_TX_STATUS_MSG	= 2,
-	IEEE80211_DELBA_MSG	= 3,
-	IEEE80211_ADDBA_MSG	= 4,
 };
 
 enum queue_stop_reason {
@@ -594,11 +644,17 @@ enum queue_stop_reason {
  *	determine if we are on the operating channel or not
  * @SCAN_OFF_CHANNEL: We're off our operating channel for scanning,
  *	gets only set in conjunction with SCAN_SW_SCANNING
+ * @SCAN_COMPLETED: Set for our scan work function when the driver reported
+ *	that the scan completed.
+ * @SCAN_ABORTED: Set for our scan work function when the driver reported
+ *	a scan complete for an aborted scan.
  */
 enum {
 	SCAN_SW_SCANNING,
 	SCAN_HW_SCANNING,
 	SCAN_OFF_CHANNEL,
+	SCAN_COMPLETED,
+	SCAN_ABORTED,
 };
 
 /**
@@ -633,7 +689,6 @@ struct ieee80211_local {
 	/*
 	 * work stuff, potentially off-channel (in the future)
 	 */
-	struct mutex work_mtx;
 	struct list_head work_list;
 	struct timer_list work_timer;
 	struct work_struct work_work;
@@ -652,9 +707,13 @@ struct ieee80211_local {
 	int open_count;
 	int monitors, cooked_mntrs;
 	/* number of interfaces with corresponding FIF_ flags */
-	int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll;
+	int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll,
+	    fif_probe_req;
+	int probe_req_reg;
 	unsigned int filter_flags; /* FIF_* */
 
+	bool wiphy_ciphers_allocated;
+
 	/* protects the aggregated multicast list and filter calls */
 	spinlock_t filter_lock;
 
@@ -724,13 +783,7 @@ struct ieee80211_local {
 	struct sk_buff_head pending[IEEE80211_MAX_QUEUES];
 	struct tasklet_struct tx_pending_tasklet;
 
-	/*
-	 * This lock is used to prevent concurrent A-MPDU
-	 * session start/stop processing, this thus also
-	 * synchronises the ->ampdu_action() callback to
-	 * drivers and limits it to one at a time.
-	 */
-	spinlock_t ampdu_lock;
+	atomic_t agg_queue_stop[IEEE80211_MAX_QUEUES];
 
 	/* number of interfaces with corresponding IFF_ flags */
 	atomic_t iff_allmultis, iff_promiscs;
@@ -746,14 +799,15 @@ struct ieee80211_local {
 	struct mutex iflist_mtx;
 
 	/*
-	 * Key lock, protects sdata's key_list and sta_info's
+	 * Key mutex, protects sdata's key_list and sta_info's
 	 * key pointers (write access, they're RCU.)
 	 */
-	spinlock_t key_lock;
+	struct mutex key_mtx;
 
+	/* mutex for scan and work locking */
+	struct mutex mtx;
 
 	/* Scanning and BSS list */
-	struct mutex scan_mtx;
 	unsigned long scanning;
 	struct cfg80211_ssid scan_ssid;
 	struct cfg80211_scan_request *int_scan_req;
@@ -851,6 +905,15 @@ struct ieee80211_local {
 	struct work_struct dynamic_ps_disable_work;
 	struct timer_list dynamic_ps_timer;
 	struct notifier_block network_latency_notifier;
+	struct notifier_block ifa_notifier;
+
+	/*
+	 * The dynamic ps timeout configured from user space via WEXT -
+	 * this will override whatever chosen by mac80211 internally.
+	 */
+	int dynamic_ps_forced_timeout;
+	int dynamic_ps_user_timeout;
+	bool disable_dynamic_ps;
 
 	int user_power_level; /* in dBm */
 	int power_constr_level; /* in dBm */
@@ -862,10 +925,14 @@ struct ieee80211_local {
 #ifdef CONFIG_MAC80211_DEBUGFS
 	struct local_debugfsdentries {
 		struct dentry *rcdir;
-		struct dentry *stations;
 		struct dentry *keys;
 	} debugfs;
 #endif
+
+	/* dummy netdev for use w/ NAPI */
+	struct net_device napi_dev;
+
+	struct napi_struct napi;
 };
 
 static inline struct ieee80211_sub_if_data *
@@ -874,9 +941,8 @@ IEEE80211_DEV_TO_SUB_IF(struct net_device *dev)
 	return netdev_priv(dev);
 }
 
-/* this struct represents 802.11n's RA/TID combination along with our vif */
+/* this struct represents 802.11n's RA/TID combination */
 struct ieee80211_ra_tid {
-	struct ieee80211_vif *vif;
 	u8 ra[ETH_ALEN];
 	u16 tid;
 };
@@ -985,29 +1051,27 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
 			   struct cfg80211_disassoc_request *req,
 			   void *cookie);
-int ieee80211_mgd_action(struct ieee80211_sub_if_data *sdata,
-			 struct ieee80211_channel *chan,
-			 enum nl80211_channel_type channel_type,
-			 const u8 *buf, size_t len, u64 *cookie);
-ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
-					  struct sk_buff *skb);
 void ieee80211_send_pspoll(struct ieee80211_local *local,
 			   struct ieee80211_sub_if_data *sdata);
 void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency);
 int ieee80211_max_network_latency(struct notifier_block *nb,
 				  unsigned long data, void *dummy);
+int ieee80211_set_arp_filter(struct ieee80211_sub_if_data *sdata);
 void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_channel_sw_ie *sw_elem,
 				      struct ieee80211_bss *bss,
 				      u64 timestamp);
 void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata);
 void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				  struct sk_buff *skb);
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata);
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
 
 /* IBSS code */
 void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
 void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
-ieee80211_rx_result
-ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
 struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
 					u8 *bssid, u8 *addr, u32 supp_rates,
 					gfp_t gfp);
@@ -1016,6 +1080,14 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata);
 void ieee80211_ibss_quiesce(struct ieee80211_sub_if_data *sdata);
 void ieee80211_ibss_restart(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb);
+
+/* mesh code */
+void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata);
+void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb);
 
 /* scan/BSS handling */
 void ieee80211_scan_work(struct work_struct *work);
@@ -1061,10 +1133,12 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata);
 void ieee80211_remove_interfaces(struct ieee80211_local *local);
 u32 __ieee80211_recalc_idle(struct ieee80211_local *local);
 void ieee80211_recalc_idle(struct ieee80211_local *local);
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+				    const int offset);
 
 static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata)
 {
-	return netif_running(sdata->dev);
+	return test_bit(SDATA_STATE_RUNNING, &sdata->state);
 }
 
 /* tx handling */
@@ -1084,7 +1158,7 @@ struct ieee80211_tx_status_rtap_hdr {
 	u8 padding_for_rate;
 	__le16 tx_flags;
 	u8 data_retries;
-} __attribute__ ((packed));
+} __packed;
 
 
 /* HT */
@@ -1098,10 +1172,13 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
 int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
 			       enum ieee80211_smps_mode smps, const u8 *da,
 			       const u8 *bssid);
+void ieee80211_request_smps_work(struct work_struct *work);
 
+void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+				     u16 initiator, u16 reason, bool stop);
 void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
-				    u16 initiator, u16 reason);
-void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta);
+				    u16 initiator, u16 reason, bool stop);
+void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx);
 void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
 			     struct sta_info *sta,
 			     struct ieee80211_mgmt *mgmt, size_t len);
@@ -1115,9 +1192,16 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 				     size_t len);
 
 int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
-				   enum ieee80211_back_parties initiator);
+				   enum ieee80211_back_parties initiator,
+				   bool tx);
 int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
-				    enum ieee80211_back_parties initiator);
+				    enum ieee80211_back_parties initiator,
+				    bool tx);
+void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid);
+void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid);
+void ieee80211_ba_session_work(struct work_struct *work);
+void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
 
 /* Spectrum management */
 void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
@@ -1133,6 +1217,12 @@ int __ieee80211_suspend(struct ieee80211_hw *hw);
 
 static inline int __ieee80211_resume(struct ieee80211_hw *hw)
 {
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
+		"%s: resume with hardware scan still in progress\n",
+		wiphy_name(hw->wiphy));
+
 	return ieee80211_reconfig(hw_to_local(hw));
 }
 #else
@@ -1195,7 +1285,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 			 const u8 *key, u8 key_len, u8 key_idx);
 int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 			     const u8 *ie, size_t ie_len,
-			     enum ieee80211_band band);
+			     enum ieee80211_band band, u32 rate_mask,
+			     u8 channel);
 void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 			      const u8 *ssid, size_t ssid_len,
 			      const u8 *ie, size_t ie_len);
@@ -1208,8 +1299,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
 			    enum ieee80211_band band);
 int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata,
 			     enum ieee80211_smps_mode smps_mode);
-void ieee80211_recalc_smps(struct ieee80211_local *local,
-			   struct ieee80211_sub_if_data *forsdata);
+void ieee80211_recalc_smps(struct ieee80211_local *local);
 
 size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
 			  const u8 *ids, int n_ids, size_t offset);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 50deb017fd6..f9163b12c7f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -24,6 +24,7 @@
 #include "led.h"
 #include "driver-ops.h"
 #include "wme.h"
+#include "rate.h"
 
 /**
  * DOC: Interface list locking
@@ -94,21 +95,14 @@ static inline int identical_mac_addr_allowed(int type1, int type2)
 			 type2 == NL80211_IFTYPE_AP_VLAN));
 }
 
-static int ieee80211_open(struct net_device *dev)
+static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata,
+					    enum nl80211_iftype iftype)
 {
-	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-	struct ieee80211_sub_if_data *nsdata;
 	struct ieee80211_local *local = sdata->local;
-	struct sta_info *sta;
-	u32 changed = 0;
-	int res;
-	u32 hw_reconf_flags = 0;
-	u8 null_addr[ETH_ALEN] = {0};
+	struct ieee80211_sub_if_data *nsdata;
+	struct net_device *dev = sdata->dev;
 
-	/* fail early if user set an invalid address */
-	if (compare_ether_addr(dev->dev_addr, null_addr) &&
-	    !is_valid_ether_addr(dev->dev_addr))
-		return -EADDRNOTAVAIL;
+	ASSERT_RTNL();
 
 	/* we hold the RTNL here so can safely walk the list */
 	list_for_each_entry(nsdata, &local->interfaces, list) {
@@ -125,7 +119,7 @@ static int ieee80211_open(struct net_device *dev)
 			 * belonging to the same hardware. Then, however, we're
 			 * faced with having to adopt two different TSF timers...
 			 */
-			if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
+			if (iftype == NL80211_IFTYPE_ADHOC &&
 			    nsdata->vif.type == NL80211_IFTYPE_ADHOC)
 				return -EBUSY;
 
@@ -139,19 +133,56 @@ static int ieee80211_open(struct net_device *dev)
 			/*
 			 * check whether it may have the same address
 			 */
-			if (!identical_mac_addr_allowed(sdata->vif.type,
+			if (!identical_mac_addr_allowed(iftype,
 							nsdata->vif.type))
 				return -ENOTUNIQ;
 
 			/*
 			 * can only add VLANs to enabled APs
 			 */
-			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+			if (iftype == NL80211_IFTYPE_AP_VLAN &&
 			    nsdata->vif.type == NL80211_IFTYPE_AP)
 				sdata->bss = &nsdata->u.ap;
 		}
 	}
 
+	return 0;
+}
+
+void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata,
+				    const int offset)
+{
+	struct ieee80211_local *local = sdata->local;
+	u32 flags = sdata->u.mntr_flags;
+
+#define ADJUST(_f, _s)	do {					\
+	if (flags & MONITOR_FLAG_##_f)				\
+		local->fif_##_s += offset;			\
+	} while (0)
+
+	ADJUST(FCSFAIL, fcsfail);
+	ADJUST(PLCPFAIL, plcpfail);
+	ADJUST(CONTROL, control);
+	ADJUST(CONTROL, pspoll);
+	ADJUST(OTHER_BSS, other_bss);
+
+#undef ADJUST
+}
+
+/*
+ * NOTE: Be very careful when changing this function, it must NOT return
+ * an error on interface type changes that have been pre-checked, so most
+ * checks should be in ieee80211_check_concurrent_iface.
+ */
+static int ieee80211_do_open(struct net_device *dev, bool coming_up)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct sta_info *sta;
+	u32 changed = 0;
+	int res;
+	u32 hw_reconf_flags = 0;
+
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_WDS:
 		if (!is_valid_ether_addr(sdata->u.wds.remote_addr))
@@ -177,7 +208,9 @@ static int ieee80211_open(struct net_device *dev)
 		/* no special treatment */
 		break;
 	case NL80211_IFTYPE_UNSPECIFIED:
-	case __NL80211_IFTYPE_AFTER_LAST:
+	case NUM_NL80211_IFTYPES:
+	case NL80211_IFTYPE_P2P_CLIENT:
+	case NL80211_IFTYPE_P2P_GO:
 		/* cannot happen */
 		WARN_ON(1);
 		break;
@@ -187,39 +220,30 @@ static int ieee80211_open(struct net_device *dev)
 		res = drv_start(local);
 		if (res)
 			goto err_del_bss;
+		if (local->ops->napi_poll)
+			napi_enable(&local->napi);
 		/* we're brought up, everything changes */
 		hw_reconf_flags = ~0;
 		ieee80211_led_radio(local, true);
 	}
 
 	/*
-	 * Check all interfaces and copy the hopefully now-present
-	 * MAC address to those that have the special null one.
+	 * Copy the hopefully now-present MAC address to
+	 * this interface, if it has the special null one.
 	 */
-	list_for_each_entry(nsdata, &local->interfaces, list) {
-		struct net_device *ndev = nsdata->dev;
-
-		/*
-		 * No need to check running since we do not allow
-		 * it to start up with this invalid address.
-		 */
-		if (compare_ether_addr(null_addr, ndev->dev_addr) == 0) {
-			memcpy(ndev->dev_addr,
-			       local->hw.wiphy->perm_addr,
-			       ETH_ALEN);
-			memcpy(ndev->perm_addr, ndev->dev_addr, ETH_ALEN);
+	if (is_zero_ether_addr(dev->dev_addr)) {
+		memcpy(dev->dev_addr,
+		       local->hw.wiphy->perm_addr,
+		       ETH_ALEN);
+		memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN);
+
+		if (!is_valid_ether_addr(dev->dev_addr)) {
+			if (!local->open_count)
+				drv_stop(local);
+			return -EADDRNOTAVAIL;
 		}
 	}
 
-	/*
-	 * Validate the MAC address for this device.
-	 */
-	if (!is_valid_ether_addr(dev->dev_addr)) {
-		if (!local->open_count)
-			drv_stop(local);
-		return -EADDRNOTAVAIL;
-	}
-
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP_VLAN:
 		/* no need to tell driver */
@@ -237,23 +261,17 @@ static int ieee80211_open(struct net_device *dev)
 			hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
 		}
 
-		if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
-			local->fif_fcsfail++;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
-			local->fif_plcpfail++;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) {
-			local->fif_control++;
-			local->fif_pspoll++;
-		}
-		if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
-			local->fif_other_bss++;
-
+		ieee80211_adjust_monitor_flags(sdata, 1);
 		ieee80211_configure_filter(local);
+
+		netif_carrier_on(dev);
 		break;
 	default:
-		res = drv_add_interface(local, &sdata->vif);
-		if (res)
-			goto err_stop;
+		if (coming_up) {
+			res = drv_add_interface(local, &sdata->vif);
+			if (res)
+				goto err_stop;
+		}
 
 		if (ieee80211_vif_is_mesh(&sdata->vif)) {
 			local->fif_other_bss++;
@@ -262,13 +280,15 @@ static int ieee80211_open(struct net_device *dev)
 			ieee80211_start_mesh(sdata);
 		} else if (sdata->vif.type == NL80211_IFTYPE_AP) {
 			local->fif_pspoll++;
+			local->fif_probe_req++;
 
 			ieee80211_configure_filter(local);
+		} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+			local->fif_probe_req++;
 		}
 
 		changed |= ieee80211_reset_erp_info(sdata);
 		ieee80211_bss_info_change_notify(sdata, changed);
-		ieee80211_enable_keys(sdata);
 
 		if (sdata->vif.type == NL80211_IFTYPE_STATION)
 			netif_carrier_off(dev);
@@ -276,6 +296,8 @@ static int ieee80211_open(struct net_device *dev)
 			netif_carrier_on(dev);
 	}
 
+	set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
 	if (sdata->vif.type == NL80211_IFTYPE_WDS) {
 		/* Create STA entry for the WDS peer */
 		sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr,
@@ -293,6 +315,8 @@ static int ieee80211_open(struct net_device *dev)
 			/* STA has been freed */
 			goto err_del_interface;
 		}
+
+		rate_control_rate_init(sta);
 	}
 
 	/*
@@ -306,9 +330,13 @@ static int ieee80211_open(struct net_device *dev)
 	if (sdata->flags & IEEE80211_SDATA_PROMISC)
 		atomic_inc(&local->iff_promiscs);
 
+	mutex_lock(&local->mtx);
 	hw_reconf_flags |= __ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+
+	if (coming_up)
+		local->open_count++;
 
-	local->open_count++;
 	if (hw_reconf_flags) {
 		ieee80211_hw_config(local, hw_reconf_flags);
 		/*
@@ -321,15 +349,6 @@ static int ieee80211_open(struct net_device *dev)
 
 	ieee80211_recalc_ps(local, -1);
 
-	/*
-	 * ieee80211_sta_work is disabled while network interface
-	 * is down. Therefore, some configuration changes may not
-	 * yet be effective. Trigger execution of ieee80211_sta_work
-	 * to fix this.
-	 */
-	if (sdata->vif.type == NL80211_IFTYPE_STATION)
-		ieee80211_queue_work(&local->hw, &sdata->u.mgd.work);
-
 	netif_tx_start_all_queues(dev);
 
 	return 0;
@@ -342,23 +361,42 @@ static int ieee80211_open(struct net_device *dev)
 	sdata->bss = NULL;
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		list_del(&sdata->u.vlan.list);
+	clear_bit(SDATA_STATE_RUNNING, &sdata->state);
 	return res;
 }
 
-static int ieee80211_stop(struct net_device *dev)
+static int ieee80211_open(struct net_device *dev)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	int err;
+
+	/* fail early if user set an invalid address */
+	if (!is_zero_ether_addr(dev->dev_addr) &&
+	    !is_valid_ether_addr(dev->dev_addr))
+		return -EADDRNOTAVAIL;
+
+	err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
+	if (err)
+		return err;
+
+	return ieee80211_do_open(dev, true);
+}
+
+static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
+			      bool going_down)
+{
 	struct ieee80211_local *local = sdata->local;
-	struct sta_info *sta;
 	unsigned long flags;
 	struct sk_buff *skb, *tmp;
 	u32 hw_reconf_flags = 0;
 	int i;
 
+	clear_bit(SDATA_STATE_RUNNING, &sdata->state);
+
 	/*
 	 * Stop TX on this interface first.
 	 */
-	netif_tx_stop_all_queues(dev);
+	netif_tx_stop_all_queues(sdata->dev);
 
 	/*
 	 * Purge work for this interface.
@@ -366,18 +404,6 @@ static int ieee80211_stop(struct net_device *dev)
 	ieee80211_work_purge(sdata);
 
 	/*
-	 * Now delete all active aggregation sessions.
-	 */
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(sta, &local->sta_list, list) {
-		if (sta->sdata == sdata)
-			ieee80211_sta_tear_down_BA_sessions(sta);
-	}
-
-	rcu_read_unlock();
-
-	/*
 	 * Remove all stations associated with this interface.
 	 *
 	 * This must be done before calling ops->remove_interface()
@@ -387,12 +413,9 @@ static int ieee80211_stop(struct net_device *dev)
 	 * (because if we remove a STA after ops->remove_interface()
 	 * the driver will have removed the vif info already!)
 	 *
-	 * We could relax this and only unlink the stations from the
-	 * hash table and list but keep them on a per-sdata list that
-	 * will be inserted back again when the interface is brought
-	 * up again, but I don't currently see a use case for that,
-	 * except with WDS which gets a STA entry created when it is
-	 * brought up.
+	 * This is relevant only in AP, WDS and mesh modes, since in
+	 * all other modes we've already removed all stations when
+	 * disconnecting etc.
 	 */
 	sta_info_flush(local, sdata);
 
@@ -408,14 +431,19 @@ static int ieee80211_stop(struct net_device *dev)
 	if (sdata->flags & IEEE80211_SDATA_PROMISC)
 		atomic_dec(&local->iff_promiscs);
 
-	if (sdata->vif.type == NL80211_IFTYPE_AP)
+	if (sdata->vif.type == NL80211_IFTYPE_AP) {
 		local->fif_pspoll--;
+		local->fif_probe_req--;
+	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
+		local->fif_probe_req--;
+	}
 
-	netif_addr_lock_bh(dev);
+	netif_addr_lock_bh(sdata->dev);
 	spin_lock_bh(&local->filter_lock);
-	__hw_addr_unsync(&local->mc_list, &dev->mc, dev->addr_len);
+	__hw_addr_unsync(&local->mc_list, &sdata->dev->mc,
+			 sdata->dev->addr_len);
 	spin_unlock_bh(&local->filter_lock);
-	netif_addr_unlock_bh(dev);
+	netif_addr_unlock_bh(sdata->dev);
 
 	ieee80211_configure_filter(local);
 
@@ -427,11 +455,21 @@ static int ieee80211_stop(struct net_device *dev)
 		struct ieee80211_sub_if_data *vlan, *tmpsdata;
 		struct beacon_data *old_beacon = sdata->u.ap.beacon;
 
+		/* sdata_running will return false, so this will disable */
+		ieee80211_bss_info_change_notify(sdata,
+						 BSS_CHANGED_BEACON_ENABLED);
+
 		/* remove beacon */
 		rcu_assign_pointer(sdata->u.ap.beacon, NULL);
 		synchronize_rcu();
 		kfree(old_beacon);
 
+		/* free all potentially still buffered bcast frames */
+		while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
+			local->total_ps_buffered--;
+			dev_kfree_skb(skb);
+		}
+
 		/* down all dependent devices, that is VLANs */
 		list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans,
 					 u.vlan.list)
@@ -439,7 +477,8 @@ static int ieee80211_stop(struct net_device *dev)
 		WARN_ON(!list_empty(&sdata->u.ap.vlans));
 	}
 
-	local->open_count--;
+	if (going_down)
+		local->open_count--;
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP_VLAN:
@@ -458,53 +497,9 @@ static int ieee80211_stop(struct net_device *dev)
 			hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR;
 		}
 
-		if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL)
-			local->fif_fcsfail--;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL)
-			local->fif_plcpfail--;
-		if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) {
-			local->fif_pspoll--;
-			local->fif_control--;
-		}
-		if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS)
-			local->fif_other_bss--;
-
+		ieee80211_adjust_monitor_flags(sdata, -1);
 		ieee80211_configure_filter(local);
 		break;
-	case NL80211_IFTYPE_STATION:
-		del_timer_sync(&sdata->u.mgd.chswitch_timer);
-		del_timer_sync(&sdata->u.mgd.timer);
-		del_timer_sync(&sdata->u.mgd.conn_mon_timer);
-		del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
-		/*
-		 * If any of the timers fired while we waited for it, it will
-		 * have queued its work. Now the work will be running again
-		 * but will not rearm the timer again because it checks
-		 * whether the interface is running, which, at this point,
-		 * it no longer is.
-		 */
-		cancel_work_sync(&sdata->u.mgd.work);
-		cancel_work_sync(&sdata->u.mgd.chswitch_work);
-		cancel_work_sync(&sdata->u.mgd.monitor_work);
-		cancel_work_sync(&sdata->u.mgd.beacon_connection_loss_work);
-
-		/*
-		 * When we get here, the interface is marked down.
-		 * Call synchronize_rcu() to wait for the RX path
-		 * should it be using the interface and enqueuing
-		 * frames at this very time on another CPU.
-		 */
-		synchronize_rcu();
-		skb_queue_purge(&sdata->u.mgd.skb_queue);
-		/* fall through */
-	case NL80211_IFTYPE_ADHOC:
-		if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
-			del_timer_sync(&sdata->u.ibss.timer);
-			cancel_work_sync(&sdata->u.ibss.work);
-			synchronize_rcu();
-			skb_queue_purge(&sdata->u.ibss.skb_queue);
-		}
-		/* fall through */
 	case NL80211_IFTYPE_MESH_POINT:
 		if (ieee80211_vif_is_mesh(&sdata->vif)) {
 			/* other_bss and allmulti are always set on mesh
@@ -518,31 +513,48 @@ static int ieee80211_stop(struct net_device *dev)
 		}
 		/* fall through */
 	default:
+		flush_work(&sdata->work);
+		/*
+		 * When we get here, the interface is marked down.
+		 * Call synchronize_rcu() to wait for the RX path
+		 * should it be using the interface and enqueuing
+		 * frames at this very time on another CPU.
+		 */
+		synchronize_rcu();
+		skb_queue_purge(&sdata->skb_queue);
+
 		if (local->scan_sdata == sdata)
 			ieee80211_scan_cancel(local);
 
 		/*
-		 * Disable beaconing for AP and mesh, IBSS can't
-		 * still be joined to a network at this point.
+		 * Disable beaconing here for mesh only, AP and IBSS
+		 * are already taken care of.
 		 */
-		if (sdata->vif.type == NL80211_IFTYPE_AP ||
-		    sdata->vif.type == NL80211_IFTYPE_MESH_POINT) {
+		if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
 			ieee80211_bss_info_change_notify(sdata,
 				BSS_CHANGED_BEACON_ENABLED);
-		}
 
-		/* disable all keys for as long as this netdev is down */
-		ieee80211_disable_keys(sdata);
-		drv_remove_interface(local, &sdata->vif);
+		/*
+		 * Free all remaining keys, there shouldn't be any,
+		 * except maybe group keys in AP more or WDS?
+		 */
+		ieee80211_free_keys(sdata);
+
+		if (going_down)
+			drv_remove_interface(local, &sdata->vif);
 	}
 
 	sdata->bss = NULL;
 
+	mutex_lock(&local->mtx);
 	hw_reconf_flags |= __ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
 
 	ieee80211_recalc_ps(local, -1);
 
 	if (local->open_count == 0) {
+		if (local->ops->napi_poll)
+			napi_disable(&local->napi);
 		ieee80211_clear_tx_pending(local);
 		ieee80211_stop_device(local);
 
@@ -565,6 +577,13 @@ static int ieee80211_stop(struct net_device *dev)
 		}
 	}
 	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+}
+
+static int ieee80211_stop(struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	ieee80211_do_stop(sdata, true);
 
 	return 0;
 }
@@ -609,8 +628,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
-	struct beacon_data *beacon;
-	struct sk_buff *skb;
 	int flushed;
 	int i;
 
@@ -623,37 +640,8 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
 		__skb_queue_purge(&sdata->fragments[i].skb_list);
 	sdata->fragment_next = 0;
 
-	switch (sdata->vif.type) {
-	case NL80211_IFTYPE_AP:
-		beacon = sdata->u.ap.beacon;
-		rcu_assign_pointer(sdata->u.ap.beacon, NULL);
-		synchronize_rcu();
-		kfree(beacon);
-
-		while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) {
-			local->total_ps_buffered--;
-			dev_kfree_skb(skb);
-		}
-
-		break;
-	case NL80211_IFTYPE_MESH_POINT:
-		if (ieee80211_vif_is_mesh(&sdata->vif))
-			mesh_rmc_free(sdata);
-		break;
-	case NL80211_IFTYPE_ADHOC:
-		if (WARN_ON(sdata->u.ibss.presp))
-			kfree_skb(sdata->u.ibss.presp);
-		break;
-	case NL80211_IFTYPE_STATION:
-	case NL80211_IFTYPE_WDS:
-	case NL80211_IFTYPE_AP_VLAN:
-	case NL80211_IFTYPE_MONITOR:
-		break;
-	case NL80211_IFTYPE_UNSPECIFIED:
-	case __NL80211_IFTYPE_AFTER_LAST:
-		BUG();
-		break;
-	}
+	if (ieee80211_vif_is_mesh(&sdata->vif))
+		mesh_rmc_free(sdata);
 
 	flushed = sta_info_flush(local, sdata);
 	WARN_ON(flushed);
@@ -727,6 +715,137 @@ static void ieee80211_if_setup(struct net_device *dev)
 	dev->destructor = free_netdev;
 }
 
+static void ieee80211_iface_work(struct work_struct *work)
+{
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data, work);
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct sta_info *sta;
+	struct ieee80211_ra_tid *ra_tid;
+
+	if (!ieee80211_sdata_running(sdata))
+		return;
+
+	if (local->scanning)
+		return;
+
+	/*
+	 * ieee80211_queue_work() should have picked up most cases,
+	 * here we'll pick the rest.
+	 */
+	if (WARN(local->suspended,
+		 "interface work scheduled while going to suspend\n"))
+		return;
+
+	/* first process frames */
+	while ((skb = skb_dequeue(&sdata->skb_queue))) {
+		struct ieee80211_mgmt *mgmt = (void *)skb->data;
+
+		if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_START) {
+			ra_tid = (void *)&skb->cb;
+			ieee80211_start_tx_ba_cb(&sdata->vif, ra_tid->ra,
+						 ra_tid->tid);
+		} else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_AGG_STOP) {
+			ra_tid = (void *)&skb->cb;
+			ieee80211_stop_tx_ba_cb(&sdata->vif, ra_tid->ra,
+						ra_tid->tid);
+		} else if (ieee80211_is_action(mgmt->frame_control) &&
+			   mgmt->u.action.category == WLAN_CATEGORY_BACK) {
+			int len = skb->len;
+
+			mutex_lock(&local->sta_mtx);
+			sta = sta_info_get_bss(sdata, mgmt->sa);
+			if (sta) {
+				switch (mgmt->u.action.u.addba_req.action_code) {
+				case WLAN_ACTION_ADDBA_REQ:
+					ieee80211_process_addba_request(
+							local, sta, mgmt, len);
+					break;
+				case WLAN_ACTION_ADDBA_RESP:
+					ieee80211_process_addba_resp(local, sta,
+								     mgmt, len);
+					break;
+				case WLAN_ACTION_DELBA:
+					ieee80211_process_delba(sdata, sta,
+								mgmt, len);
+					break;
+				default:
+					WARN_ON(1);
+					break;
+				}
+			}
+			mutex_unlock(&local->sta_mtx);
+		} else if (ieee80211_is_data_qos(mgmt->frame_control)) {
+			struct ieee80211_hdr *hdr = (void *)mgmt;
+			/*
+			 * So the frame isn't mgmt, but frame_control
+			 * is at the right place anyway, of course, so
+			 * the if statement is correct.
+			 *
+			 * Warn if we have other data frame types here,
+			 * they must not get here.
+			 */
+			WARN_ON(hdr->frame_control &
+					cpu_to_le16(IEEE80211_STYPE_NULLFUNC));
+			WARN_ON(!(hdr->seq_ctrl &
+					cpu_to_le16(IEEE80211_SCTL_FRAG)));
+			/*
+			 * This was a fragment of a frame, received while
+			 * a block-ack session was active. That cannot be
+			 * right, so terminate the session.
+			 */
+			mutex_lock(&local->sta_mtx);
+			sta = sta_info_get_bss(sdata, mgmt->sa);
+			if (sta) {
+				u16 tid = *ieee80211_get_qos_ctl(hdr) &
+						IEEE80211_QOS_CTL_TID_MASK;
+
+				__ieee80211_stop_rx_ba_session(
+					sta, tid, WLAN_BACK_RECIPIENT,
+					WLAN_REASON_QSTA_REQUIRE_SETUP,
+					true);
+			}
+			mutex_unlock(&local->sta_mtx);
+		} else switch (sdata->vif.type) {
+		case NL80211_IFTYPE_STATION:
+			ieee80211_sta_rx_queued_mgmt(sdata, skb);
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			ieee80211_ibss_rx_queued_mgmt(sdata, skb);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			if (!ieee80211_vif_is_mesh(&sdata->vif))
+				break;
+			ieee80211_mesh_rx_queued_mgmt(sdata, skb);
+			break;
+		default:
+			WARN(1, "frame for unexpected interface type");
+			break;
+		}
+
+		kfree_skb(skb);
+	}
+
+	/* then other type-dependent work */
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		ieee80211_sta_work(sdata);
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		ieee80211_ibss_work(sdata);
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		if (!ieee80211_vif_is_mesh(&sdata->vif))
+			break;
+		ieee80211_mesh_work(sdata);
+		break;
+	default:
+		break;
+	}
+}
+
+
 /*
  * Helper function to initialise an interface to a specific type.
  */
@@ -738,17 +857,34 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 
 	/* and set some type-dependent values */
 	sdata->vif.type = type;
+	sdata->vif.p2p = false;
 	sdata->dev->netdev_ops = &ieee80211_dataif_ops;
 	sdata->wdev.iftype = type;
 
+	sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
+	sdata->control_port_no_encrypt = false;
+
 	/* only monitor differs */
 	sdata->dev->type = ARPHRD_ETHER;
 
+	skb_queue_head_init(&sdata->skb_queue);
+	INIT_WORK(&sdata->work, ieee80211_iface_work);
+
 	switch (type) {
+	case NL80211_IFTYPE_P2P_GO:
+		type = NL80211_IFTYPE_AP;
+		sdata->vif.type = type;
+		sdata->vif.p2p = true;
+		/* fall through */
 	case NL80211_IFTYPE_AP:
 		skb_queue_head_init(&sdata->u.ap.ps_bc_buf);
 		INIT_LIST_HEAD(&sdata->u.ap.vlans);
 		break;
+	case NL80211_IFTYPE_P2P_CLIENT:
+		type = NL80211_IFTYPE_STATION;
+		sdata->vif.type = type;
+		sdata->vif.p2p = true;
+		/* fall through */
 	case NL80211_IFTYPE_STATION:
 		ieee80211_sta_setup_sdata(sdata);
 		break;
@@ -769,7 +905,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	case NL80211_IFTYPE_AP_VLAN:
 		break;
 	case NL80211_IFTYPE_UNSPECIFIED:
-	case __NL80211_IFTYPE_AFTER_LAST:
+	case NUM_NL80211_IFTYPES:
 		BUG();
 		break;
 	}
@@ -777,12 +913,85 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
 	ieee80211_debugfs_add_netdev(sdata);
 }
 
+static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata,
+					   enum nl80211_iftype type)
+{
+	struct ieee80211_local *local = sdata->local;
+	int ret, err;
+	enum nl80211_iftype internal_type = type;
+	bool p2p = false;
+
+	ASSERT_RTNL();
+
+	if (!local->ops->change_interface)
+		return -EBUSY;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+		/*
+		 * Could maybe also all others here?
+		 * Just not sure how that interacts
+		 * with the RX/config path e.g. for
+		 * mesh.
+		 */
+		break;
+	default:
+		return -EBUSY;
+	}
+
+	switch (type) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_ADHOC:
+		/*
+		 * Could probably support everything
+		 * but WDS here (WDS do_open can fail
+		 * under memory pressure, which this
+		 * code isn't prepared to handle).
+		 */
+		break;
+	case NL80211_IFTYPE_P2P_CLIENT:
+		p2p = true;
+		internal_type = NL80211_IFTYPE_STATION;
+		break;
+	case NL80211_IFTYPE_P2P_GO:
+		p2p = true;
+		internal_type = NL80211_IFTYPE_AP;
+		break;
+	default:
+		return -EBUSY;
+	}
+
+	ret = ieee80211_check_concurrent_iface(sdata, internal_type);
+	if (ret)
+		return ret;
+
+	ieee80211_do_stop(sdata, false);
+
+	ieee80211_teardown_sdata(sdata->dev);
+
+	ret = drv_change_interface(local, sdata, internal_type, p2p);
+	if (ret)
+		type = sdata->vif.type;
+
+	ieee80211_setup_sdata(sdata, type);
+
+	err = ieee80211_do_open(sdata->dev, false);
+	WARN(err, "type change: do_open returned %d", err);
+
+	return ret;
+}
+
 int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
 			     enum nl80211_iftype type)
 {
+	int ret;
+
 	ASSERT_RTNL();
 
-	if (type == sdata->vif.type)
+	if (type == ieee80211_vif_type_p2p(&sdata->vif))
 		return 0;
 
 	/* Setting ad-hoc mode on non-IBSS channel is not supported. */
@@ -790,18 +999,15 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata,
 	    type == NL80211_IFTYPE_ADHOC)
 		return -EOPNOTSUPP;
 
-	/*
-	 * We could, here, on changes between IBSS/STA/MESH modes,
-	 * invoke an MLME function instead that disassociates etc.
-	 * and goes into the requested mode.
-	 */
-
-	if (ieee80211_sdata_running(sdata))
-		return -EBUSY;
-
-	/* Purge and reset type-dependent state. */
-	ieee80211_teardown_sdata(sdata->dev);
-	ieee80211_setup_sdata(sdata, type);
+	if (ieee80211_sdata_running(sdata)) {
+		ret = ieee80211_runtime_change_iftype(sdata, type);
+		if (ret)
+			return ret;
+	} else {
+		/* Purge and reset type-dependent state. */
+		ieee80211_teardown_sdata(sdata->dev);
+		ieee80211_setup_sdata(sdata, type);
+	}
 
 	/* reset some values that shouldn't be kept across type changes */
 	sdata->vif.bss_conf.basic_rates =
@@ -969,6 +1175,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 	sdata->wdev.wiphy = local->hw.wiphy;
 	sdata->local = local;
 	sdata->dev = ndev;
+#ifdef CONFIG_INET
+	sdata->arp_filter_state = true;
+#endif
 
 	for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++)
 		skb_queue_head_init(&sdata->fragments[i].skb_list);
@@ -1055,8 +1264,7 @@ static u32 ieee80211_idle_off(struct ieee80211_local *local,
 		return 0;
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: device no longer idle - %s\n",
-	       wiphy_name(local->hw.wiphy), reason);
+	wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason);
 #endif
 
 	local->hw.conf.flags &= ~IEEE80211_CONF_IDLE;
@@ -1069,8 +1277,7 @@ static u32 ieee80211_idle_on(struct ieee80211_local *local)
 		return 0;
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: device now idle\n",
-	       wiphy_name(local->hw.wiphy));
+	wiphy_debug(local->hw.wiphy, "device now idle\n");
 #endif
 
 	drv_flush(local, false);
@@ -1083,28 +1290,61 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local)
 {
 	struct ieee80211_sub_if_data *sdata;
 	int count = 0;
+	bool working = false, scanning = false;
+	struct ieee80211_work *wk;
 
-	if (!list_empty(&local->work_list))
-		return ieee80211_idle_off(local, "working");
-
-	if (local->scanning)
-		return ieee80211_idle_off(local, "scanning");
+#ifdef CONFIG_PROVE_LOCKING
+	WARN_ON(debug_locks && !lockdep_rtnl_is_held() &&
+		!lockdep_is_held(&local->iflist_mtx));
+#endif
+	lockdep_assert_held(&local->mtx);
 
 	list_for_each_entry(sdata, &local->interfaces, list) {
-		if (!ieee80211_sdata_running(sdata))
+		if (!ieee80211_sdata_running(sdata)) {
+			sdata->vif.bss_conf.idle = true;
 			continue;
+		}
+
+		sdata->old_idle = sdata->vif.bss_conf.idle;
+
 		/* do not count disabled managed interfaces */
 		if (sdata->vif.type == NL80211_IFTYPE_STATION &&
-		    !sdata->u.mgd.associated)
+		    !sdata->u.mgd.associated) {
+			sdata->vif.bss_conf.idle = true;
 			continue;
+		}
 		/* do not count unused IBSS interfaces */
 		if (sdata->vif.type == NL80211_IFTYPE_ADHOC &&
-		    !sdata->u.ibss.ssid_len)
+		    !sdata->u.ibss.ssid_len) {
+			sdata->vif.bss_conf.idle = true;
 			continue;
+		}
 		/* count everything else */
 		count++;
 	}
 
+	list_for_each_entry(wk, &local->work_list, list) {
+		working = true;
+		wk->sdata->vif.bss_conf.idle = false;
+	}
+
+	if (local->scan_sdata) {
+		scanning = true;
+		local->scan_sdata->vif.bss_conf.idle = false;
+	}
+
+	list_for_each_entry(sdata, &local->interfaces, list) {
+		if (sdata->old_idle == sdata->vif.bss_conf.idle)
+			continue;
+		if (!ieee80211_sdata_running(sdata))
+			continue;
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE);
+	}
+
+	if (working)
+		return ieee80211_idle_off(local, "working");
+	if (scanning)
+		return ieee80211_idle_off(local, "scanning");
 	if (!count)
 		return ieee80211_idle_on(local);
 	else
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index e8f6e3b252d..ccd676b2f59 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -36,80 +36,20 @@
  * There is currently no way of knowing this except by looking into
  * debugfs.
  *
- * All key operations are protected internally so you can call them at
- * any time.
+ * All key operations are protected internally.
  *
  * Within mac80211, key references are, just as STA structure references,
  * protected by RCU. Note, however, that some things are unprotected,
  * namely the key->sta dereferences within the hardware acceleration
- * functions. This means that sta_info_destroy() must flush the key todo
- * list.
- *
- * All the direct key list manipulation functions must not sleep because
- * they can operate on STA info structs that are protected by RCU.
+ * functions. This means that sta_info_destroy() must remove the key
+ * which waits for an RCU grace period.
  */
 
 static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
 
-/* key mutex: used to synchronise todo runners */
-static DEFINE_MUTEX(key_mutex);
-static DEFINE_SPINLOCK(todo_lock);
-static LIST_HEAD(todo_list);
-
-static void key_todo(struct work_struct *work)
-{
-	ieee80211_key_todo();
-}
-
-static DECLARE_WORK(todo_work, key_todo);
-
-/**
- * add_todo - add todo item for a key
- *
- * @key: key to add to do item for
- * @flag: todo flag(s)
- *
- * Must be called with IRQs or softirqs disabled.
- */
-static void add_todo(struct ieee80211_key *key, u32 flag)
-{
-	if (!key)
-		return;
-
-	spin_lock(&todo_lock);
-	key->flags |= flag;
-	/*
-	 * Remove again if already on the list so that we move it to the end.
-	 */
-	if (!list_empty(&key->todo))
-		list_del(&key->todo);
-	list_add_tail(&key->todo, &todo_list);
-	schedule_work(&todo_work);
-	spin_unlock(&todo_lock);
-}
-
-/**
- * ieee80211_key_lock - lock the mac80211 key operation lock
- *
- * This locks the (global) mac80211 key operation lock, all
- * key operations must be done under this lock.
- */
-static void ieee80211_key_lock(void)
-{
-	mutex_lock(&key_mutex);
-}
-
-/**
- * ieee80211_key_unlock - unlock the mac80211 key operation lock
- */
-static void ieee80211_key_unlock(void)
-{
-	mutex_unlock(&key_mutex);
-}
-
-static void assert_key_lock(void)
+static void assert_key_lock(struct ieee80211_local *local)
 {
-	WARN_ON(!mutex_is_locked(&key_mutex));
+	lockdep_assert_held(&local->key_mtx);
 }
 
 static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
@@ -120,20 +60,29 @@ static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key)
 	return NULL;
 }
 
-static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
+static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 {
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_sta *sta;
 	int ret;
 
-	assert_key_lock();
 	might_sleep();
 
 	if (!key->local->ops->set_key)
-		return;
+		goto out_unsupported;
+
+	assert_key_lock(key->local);
 
 	sta = get_sta_for_key(key);
 
+	/*
+	 * If this is a per-STA GTK, check if it
+	 * is supported; if not, return.
+	 */
+	if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) &&
+	    !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK))
+		goto out_unsupported;
+
 	sdata = key->sdata;
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
 		sdata = container_of(sdata->bss,
@@ -143,16 +92,27 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 	ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf);
 
 	if (!ret) {
-		spin_lock_bh(&todo_lock);
 		key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE;
-		spin_unlock_bh(&todo_lock);
+		return 0;
 	}
 
-	if (ret && ret != -ENOSPC && ret != -EOPNOTSUPP)
-		printk(KERN_ERR "mac80211-%s: failed to set key "
-		       "(%d, %pM) to hardware (%d)\n",
-		       wiphy_name(key->local->hw.wiphy),
-		       key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+	if (ret != -ENOSPC && ret != -EOPNOTSUPP)
+		wiphy_err(key->local->hw.wiphy,
+			  "failed to set key (%d, %pM) to hardware (%d)\n",
+			  key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+
+ out_unsupported:
+	switch (key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+	case WLAN_CIPHER_SUITE_TKIP:
+	case WLAN_CIPHER_SUITE_CCMP:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		/* all of these we can do in software */
+		return 0;
+	default:
+		return -EINVAL;
+	}
 }
 
 static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
@@ -161,18 +121,15 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
 	struct ieee80211_sta *sta;
 	int ret;
 
-	assert_key_lock();
 	might_sleep();
 
 	if (!key || !key->local->ops->set_key)
 		return;
 
-	spin_lock_bh(&todo_lock);
-	if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) {
-		spin_unlock_bh(&todo_lock);
+	assert_key_lock(key->local);
+
+	if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
 		return;
-	}
-	spin_unlock_bh(&todo_lock);
 
 	sta = get_sta_for_key(key);
 	sdata = key->sdata;
@@ -186,37 +143,56 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
 			  sta, &key->conf);
 
 	if (ret)
-		printk(KERN_ERR "mac80211-%s: failed to remove key "
-		       "(%d, %pM) from hardware (%d)\n",
-		       wiphy_name(key->local->hw.wiphy),
-		       key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
+		wiphy_err(key->local->hw.wiphy,
+			  "failed to remove key (%d, %pM) from hardware (%d)\n",
+			  key->conf.keyidx, sta ? sta->addr : bcast_addr, ret);
 
-	spin_lock_bh(&todo_lock);
 	key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
-	spin_unlock_bh(&todo_lock);
 }
 
+void ieee80211_key_removed(struct ieee80211_key_conf *key_conf)
+{
+	struct ieee80211_key *key;
+
+	key = container_of(key_conf, struct ieee80211_key, conf);
+
+	might_sleep();
+	assert_key_lock(key->local);
+
+	key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
+
+	/*
+	 * Flush TX path to avoid attempts to use this key
+	 * after this function returns. Until then, drivers
+	 * must be prepared to handle the key.
+	 */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ieee80211_key_removed);
+
 static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata,
 					int idx)
 {
 	struct ieee80211_key *key = NULL;
 
+	assert_key_lock(sdata->local);
+
 	if (idx >= 0 && idx < NUM_DEFAULT_KEYS)
 		key = sdata->keys[idx];
 
 	rcu_assign_pointer(sdata->default_key, key);
 
-	if (key)
-		add_todo(key, KEY_FLAG_TODO_DEFKEY);
+	if (key) {
+		ieee80211_debugfs_key_remove_default(key->sdata);
+		ieee80211_debugfs_key_add_default(key->sdata);
+	}
 }
 
 void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&sdata->local->key_lock, flags);
+	mutex_lock(&sdata->local->key_mtx);
 	__ieee80211_set_default_key(sdata, idx);
-	spin_unlock_irqrestore(&sdata->local->key_lock, flags);
+	mutex_unlock(&sdata->local->key_mtx);
 }
 
 static void
@@ -224,29 +200,32 @@ __ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, int idx)
 {
 	struct ieee80211_key *key = NULL;
 
+	assert_key_lock(sdata->local);
+
 	if (idx >= NUM_DEFAULT_KEYS &&
 	    idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
 		key = sdata->keys[idx];
 
 	rcu_assign_pointer(sdata->default_mgmt_key, key);
 
-	if (key)
-		add_todo(key, KEY_FLAG_TODO_DEFMGMTKEY);
+	if (key) {
+		ieee80211_debugfs_key_remove_mgmt_default(key->sdata);
+		ieee80211_debugfs_key_add_mgmt_default(key->sdata);
+	}
 }
 
 void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
 				    int idx)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&sdata->local->key_lock, flags);
+	mutex_lock(&sdata->local->key_mtx);
 	__ieee80211_set_default_mgmt_key(sdata, idx);
-	spin_unlock_irqrestore(&sdata->local->key_lock, flags);
+	mutex_unlock(&sdata->local->key_mtx);
 }
 
 
 static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 				    struct sta_info *sta,
+				    bool pairwise,
 				    struct ieee80211_key *old,
 				    struct ieee80211_key *new)
 {
@@ -255,8 +234,14 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 	if (new)
 		list_add(&new->list, &sdata->key_list);
 
-	if (sta) {
-		rcu_assign_pointer(sta->key, new);
+	if (sta && pairwise) {
+		rcu_assign_pointer(sta->ptk, new);
+	} else if (sta) {
+		if (old)
+			idx = old->conf.keyidx;
+		else
+			idx = new->conf.keyidx;
+		rcu_assign_pointer(sta->gtk[idx], new);
 	} else {
 		WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
 
@@ -290,20 +275,18 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
 	}
 }
 
-struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
-					  int idx,
-					  size_t key_len,
+struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
 					  const u8 *key_data,
 					  size_t seq_len, const u8 *seq)
 {
 	struct ieee80211_key *key;
-	int i, j;
+	int i, j, err;
 
 	BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS);
 
 	key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL);
 	if (!key)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Default to software encryption; we'll later upload the
@@ -312,15 +295,16 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
 	key->conf.flags = 0;
 	key->flags = 0;
 
-	key->conf.alg = alg;
+	key->conf.cipher = cipher;
 	key->conf.keyidx = idx;
 	key->conf.keylen = key_len;
-	switch (alg) {
-	case ALG_WEP:
+	switch (cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
 		key->conf.iv_len = WEP_IV_LEN;
 		key->conf.icv_len = WEP_ICV_LEN;
 		break;
-	case ALG_TKIP:
+	case WLAN_CIPHER_SUITE_TKIP:
 		key->conf.iv_len = TKIP_IV_LEN;
 		key->conf.icv_len = TKIP_ICV_LEN;
 		if (seq) {
@@ -332,63 +316,76 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
 			}
 		}
 		break;
-	case ALG_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP:
 		key->conf.iv_len = CCMP_HDR_LEN;
 		key->conf.icv_len = CCMP_MIC_LEN;
 		if (seq) {
-			for (i = 0; i < NUM_RX_DATA_QUEUES; i++)
+			for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++)
 				for (j = 0; j < CCMP_PN_LEN; j++)
 					key->u.ccmp.rx_pn[i][j] =
 						seq[CCMP_PN_LEN - j - 1];
 		}
-		break;
-	case ALG_AES_CMAC:
-		key->conf.iv_len = 0;
-		key->conf.icv_len = sizeof(struct ieee80211_mmie);
-		if (seq)
-			for (j = 0; j < 6; j++)
-				key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
-		break;
-	}
-	memcpy(key->conf.key, key_data, key_len);
-	INIT_LIST_HEAD(&key->list);
-	INIT_LIST_HEAD(&key->todo);
-
-	if (alg == ALG_CCMP) {
 		/*
 		 * Initialize AES key state here as an optimization so that
 		 * it does not need to be initialized for every packet.
 		 */
 		key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data);
-		if (!key->u.ccmp.tfm) {
+		if (IS_ERR(key->u.ccmp.tfm)) {
+			err = PTR_ERR(key->u.ccmp.tfm);
 			kfree(key);
-			return NULL;
+			key = ERR_PTR(err);
 		}
-	}
-
-	if (alg == ALG_AES_CMAC) {
+		break;
+	case WLAN_CIPHER_SUITE_AES_CMAC:
+		key->conf.iv_len = 0;
+		key->conf.icv_len = sizeof(struct ieee80211_mmie);
+		if (seq)
+			for (j = 0; j < 6; j++)
+				key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1];
 		/*
 		 * Initialize AES key state here as an optimization so that
 		 * it does not need to be initialized for every packet.
 		 */
 		key->u.aes_cmac.tfm =
 			ieee80211_aes_cmac_key_setup(key_data);
-		if (!key->u.aes_cmac.tfm) {
+		if (IS_ERR(key->u.aes_cmac.tfm)) {
+			err = PTR_ERR(key->u.aes_cmac.tfm);
 			kfree(key);
-			return NULL;
+			key = ERR_PTR(err);
 		}
+		break;
 	}
+	memcpy(key->conf.key, key_data, key_len);
+	INIT_LIST_HEAD(&key->list);
 
 	return key;
 }
 
-void ieee80211_key_link(struct ieee80211_key *key,
-			struct ieee80211_sub_if_data *sdata,
-			struct sta_info *sta)
+static void __ieee80211_key_destroy(struct ieee80211_key *key)
+{
+	if (!key)
+		return;
+
+	if (key->local)
+		ieee80211_key_disable_hw_accel(key);
+
+	if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP)
+		ieee80211_aes_key_free(key->u.ccmp.tfm);
+	if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC)
+		ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm);
+	if (key->local)
+		ieee80211_debugfs_key_remove(key);
+
+	kfree(key);
+}
+
+int ieee80211_key_link(struct ieee80211_key *key,
+		       struct ieee80211_sub_if_data *sdata,
+		       struct sta_info *sta)
 {
 	struct ieee80211_key *old_key;
-	unsigned long flags;
-	int idx;
+	int idx, ret;
+	bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
 
 	BUG_ON(!sdata);
 	BUG_ON(!key);
@@ -405,13 +402,6 @@ void ieee80211_key_link(struct ieee80211_key *key,
 		 */
 		if (test_sta_flags(sta, WLAN_STA_WME))
 			key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA;
-
-		/*
-		 * This key is for a specific sta interface,
-		 * inform the driver that it should try to store
-		 * this key as pairwise key.
-		 */
-		key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
 	} else {
 		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
 			struct sta_info *ap;
@@ -431,23 +421,25 @@ void ieee80211_key_link(struct ieee80211_key *key,
 		}
 	}
 
-	spin_lock_irqsave(&sdata->local->key_lock, flags);
+	mutex_lock(&sdata->local->key_mtx);
 
-	if (sta)
-		old_key = sta->key;
+	if (sta && pairwise)
+		old_key = sta->ptk;
+	else if (sta)
+		old_key = sta->gtk[idx];
 	else
 		old_key = sdata->keys[idx];
 
-	__ieee80211_key_replace(sdata, sta, old_key, key);
+	__ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
+	__ieee80211_key_destroy(old_key);
+
+	ieee80211_debugfs_key_add(key);
 
-	/* free old key later */
-	add_todo(old_key, KEY_FLAG_TODO_DELETE);
+	ret = ieee80211_key_enable_hw_accel(key);
 
-	add_todo(key, KEY_FLAG_TODO_ADD_DEBUGFS);
-	if (ieee80211_sdata_running(sdata))
-		add_todo(key, KEY_FLAG_TODO_HWACCEL_ADD);
+	mutex_unlock(&sdata->local->key_mtx);
 
-	spin_unlock_irqrestore(&sdata->local->key_lock, flags);
+	return ret;
 }
 
 static void __ieee80211_key_free(struct ieee80211_key *key)
@@ -457,171 +449,64 @@ static void __ieee80211_key_free(struct ieee80211_key *key)
 	 */
 	if (key->sdata)
 		__ieee80211_key_replace(key->sdata, key->sta,
-					key, NULL);
-
-	add_todo(key, KEY_FLAG_TODO_DELETE);
+				key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
+				key, NULL);
+	__ieee80211_key_destroy(key);
 }
 
-void ieee80211_key_free(struct ieee80211_key *key)
+void ieee80211_key_free(struct ieee80211_local *local,
+			struct ieee80211_key *key)
 {
-	unsigned long flags;
-
 	if (!key)
 		return;
 
-	if (!key->sdata) {
-		/* The key has not been linked yet, simply free it
-		 * and don't Oops */
-		if (key->conf.alg == ALG_CCMP)
-			ieee80211_aes_key_free(key->u.ccmp.tfm);
-		kfree(key);
-		return;
-	}
-
-	spin_lock_irqsave(&key->sdata->local->key_lock, flags);
+	mutex_lock(&local->key_mtx);
 	__ieee80211_key_free(key);
-	spin_unlock_irqrestore(&key->sdata->local->key_lock, flags);
+	mutex_unlock(&local->key_mtx);
 }
 
-/*
- * To be safe against concurrent manipulations of the list (which shouldn't
- * actually happen) we need to hold the spinlock. But under the spinlock we
- * can't actually do much, so we defer processing to the todo list. Then run
- * the todo list to be sure the operation and possibly previously pending
- * operations are completed.
- */
-static void ieee80211_todo_for_each_key(struct ieee80211_sub_if_data *sdata,
-					u32 todo_flags)
+void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_key *key;
-	unsigned long flags;
-
-	might_sleep();
-
-	spin_lock_irqsave(&sdata->local->key_lock, flags);
-	list_for_each_entry(key, &sdata->key_list, list)
-		add_todo(key, todo_flags);
-	spin_unlock_irqrestore(&sdata->local->key_lock, flags);
-
-	ieee80211_key_todo();
-}
 
-void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata)
-{
 	ASSERT_RTNL();
 
 	if (WARN_ON(!ieee80211_sdata_running(sdata)))
 		return;
 
-	ieee80211_todo_for_each_key(sdata, KEY_FLAG_TODO_HWACCEL_ADD);
-}
+	mutex_lock(&sdata->local->key_mtx);
 
-void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata)
-{
-	ASSERT_RTNL();
+	list_for_each_entry(key, &sdata->key_list, list)
+		ieee80211_key_enable_hw_accel(key);
 
-	ieee80211_todo_for_each_key(sdata, KEY_FLAG_TODO_HWACCEL_REMOVE);
+	mutex_unlock(&sdata->local->key_mtx);
 }
 
-static void __ieee80211_key_destroy(struct ieee80211_key *key)
-{
-	if (!key)
-		return;
-
-	ieee80211_key_disable_hw_accel(key);
-
-	if (key->conf.alg == ALG_CCMP)
-		ieee80211_aes_key_free(key->u.ccmp.tfm);
-	if (key->conf.alg == ALG_AES_CMAC)
-		ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm);
-	ieee80211_debugfs_key_remove(key);
-
-	kfree(key);
-}
-
-static void __ieee80211_key_todo(void)
+void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_key *key;
-	bool work_done;
-	u32 todoflags;
 
-	/*
-	 * NB: sta_info_destroy relies on this!
-	 */
-	synchronize_rcu();
-
-	spin_lock_bh(&todo_lock);
-	while (!list_empty(&todo_list)) {
-		key = list_first_entry(&todo_list, struct ieee80211_key, todo);
-		list_del_init(&key->todo);
-		todoflags = key->flags & (KEY_FLAG_TODO_ADD_DEBUGFS |
-					  KEY_FLAG_TODO_DEFKEY |
-					  KEY_FLAG_TODO_DEFMGMTKEY |
-					  KEY_FLAG_TODO_HWACCEL_ADD |
-					  KEY_FLAG_TODO_HWACCEL_REMOVE |
-					  KEY_FLAG_TODO_DELETE);
-		key->flags &= ~todoflags;
-		spin_unlock_bh(&todo_lock);
-
-		work_done = false;
-
-		if (todoflags & KEY_FLAG_TODO_ADD_DEBUGFS) {
-			ieee80211_debugfs_key_add(key);
-			work_done = true;
-		}
-		if (todoflags & KEY_FLAG_TODO_DEFKEY) {
-			ieee80211_debugfs_key_remove_default(key->sdata);
-			ieee80211_debugfs_key_add_default(key->sdata);
-			work_done = true;
-		}
-		if (todoflags & KEY_FLAG_TODO_DEFMGMTKEY) {
-			ieee80211_debugfs_key_remove_mgmt_default(key->sdata);
-			ieee80211_debugfs_key_add_mgmt_default(key->sdata);
-			work_done = true;
-		}
-		if (todoflags & KEY_FLAG_TODO_HWACCEL_ADD) {
-			ieee80211_key_enable_hw_accel(key);
-			work_done = true;
-		}
-		if (todoflags & KEY_FLAG_TODO_HWACCEL_REMOVE) {
-			ieee80211_key_disable_hw_accel(key);
-			work_done = true;
-		}
-		if (todoflags & KEY_FLAG_TODO_DELETE) {
-			__ieee80211_key_destroy(key);
-			work_done = true;
-		}
+	ASSERT_RTNL();
 
-		WARN_ON(!work_done);
+	mutex_lock(&sdata->local->key_mtx);
 
-		spin_lock_bh(&todo_lock);
-	}
-	spin_unlock_bh(&todo_lock);
-}
+	list_for_each_entry(key, &sdata->key_list, list)
+		ieee80211_key_disable_hw_accel(key);
 
-void ieee80211_key_todo(void)
-{
-	ieee80211_key_lock();
-	__ieee80211_key_todo();
-	ieee80211_key_unlock();
+	mutex_unlock(&sdata->local->key_mtx);
 }
 
 void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_key *key, *tmp;
-	unsigned long flags;
 
-	ieee80211_key_lock();
+	mutex_lock(&sdata->local->key_mtx);
 
 	ieee80211_debugfs_key_remove_default(sdata);
 	ieee80211_debugfs_key_remove_mgmt_default(sdata);
 
-	spin_lock_irqsave(&sdata->local->key_lock, flags);
 	list_for_each_entry_safe(key, tmp, &sdata->key_list, list)
 		__ieee80211_key_free(key);
-	spin_unlock_irqrestore(&sdata->local->key_lock, flags);
-
-	__ieee80211_key_todo();
 
-	ieee80211_key_unlock();
+	mutex_unlock(&sdata->local->key_mtx);
 }
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index bdc2968c2bb..0db1c0f5f69 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -16,6 +16,9 @@
 #include <linux/rcupdate.h>
 #include <net/mac80211.h>
 
+#define NUM_DEFAULT_KEYS 4
+#define NUM_DEFAULT_MGMT_KEYS 2
+
 #define WEP_IV_LEN		4
 #define WEP_ICV_LEN		4
 #define ALG_TKIP_KEY_LEN	32
@@ -38,25 +41,9 @@ struct sta_info;
  *
  * @KEY_FLAG_UPLOADED_TO_HARDWARE: Indicates that this key is present
  *	in the hardware for TX crypto hardware acceleration.
- * @KEY_FLAG_TODO_DELETE: Key is marked for deletion and will, after an
- *	RCU grace period, no longer be reachable other than from the
- *	todo list.
- * @KEY_FLAG_TODO_HWACCEL_ADD: Key needs to be added to hardware acceleration.
- * @KEY_FLAG_TODO_HWACCEL_REMOVE: Key needs to be removed from hardware
- *	acceleration.
- * @KEY_FLAG_TODO_DEFKEY: Key is default key and debugfs needs to be updated.
- * @KEY_FLAG_TODO_ADD_DEBUGFS: Key needs to be added to debugfs.
- * @KEY_FLAG_TODO_DEFMGMTKEY: Key is default management key and debugfs needs
- *	to be updated.
  */
 enum ieee80211_internal_key_flags {
 	KEY_FLAG_UPLOADED_TO_HARDWARE	= BIT(0),
-	KEY_FLAG_TODO_DELETE		= BIT(1),
-	KEY_FLAG_TODO_HWACCEL_ADD	= BIT(2),
-	KEY_FLAG_TODO_HWACCEL_REMOVE	= BIT(3),
-	KEY_FLAG_TODO_DEFKEY		= BIT(4),
-	KEY_FLAG_TODO_ADD_DEBUGFS	= BIT(5),
-	KEY_FLAG_TODO_DEFMGMTKEY	= BIT(6),
 };
 
 enum ieee80211_internal_tkip_state {
@@ -79,10 +66,8 @@ struct ieee80211_key {
 
 	/* for sdata list */
 	struct list_head list;
-	/* for todo list */
-	struct list_head todo;
 
-	/* protected by todo lock! */
+	/* protected by key mutex */
 	unsigned int flags;
 
 	union {
@@ -95,7 +80,13 @@ struct ieee80211_key {
 		} tkip;
 		struct {
 			u8 tx_pn[6];
-			u8 rx_pn[NUM_RX_DATA_QUEUES][6];
+			/*
+			 * Last received packet number. The first
+			 * NUM_RX_DATA_QUEUES counters are used with Data
+			 * frames and the last counter is used with Robust
+			 * Management frames.
+			 */
+			u8 rx_pn[NUM_RX_DATA_QUEUES + 1][6];
 			struct crypto_cipher *tfm;
 			u32 replays; /* dot11RSNAStatsCCMPReplays */
 			/* scratch buffers for virt_to_page() (crypto API) */
@@ -135,19 +126,18 @@ struct ieee80211_key {
 	struct ieee80211_key_conf conf;
 };
 
-struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg,
-					  int idx,
-					  size_t key_len,
+struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
 					  const u8 *key_data,
 					  size_t seq_len, const u8 *seq);
 /*
  * Insert a key into data structures (sdata, sta if necessary)
  * to make it used, free old key.
  */
-void ieee80211_key_link(struct ieee80211_key *key,
-			struct ieee80211_sub_if_data *sdata,
-			struct sta_info *sta);
-void ieee80211_key_free(struct ieee80211_key *key);
+int __must_check ieee80211_key_link(struct ieee80211_key *key,
+				    struct ieee80211_sub_if_data *sdata,
+				    struct sta_info *sta);
+void ieee80211_key_free(struct ieee80211_local *local,
+			struct ieee80211_key *key);
 void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx);
 void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
 				    int idx);
@@ -155,6 +145,4 @@ void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata);
 void ieee80211_enable_keys(struct ieee80211_sub_if_data *sdata);
 void ieee80211_disable_keys(struct ieee80211_sub_if_data *sdata);
 
-void ieee80211_key_todo(void);
-
 #endif /* IEEE80211_KEY_H */
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 22a384dfab6..107a0cbe52a 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -20,6 +20,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/bitmap.h>
 #include <linux/pm_qos_params.h>
+#include <linux/inetdevice.h>
 #include <net/net_namespace.h>
 #include <net/cfg80211.h>
 
@@ -53,6 +54,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local)
 	if (local->monitors || local->scanning)
 		new_flags |= FIF_BCN_PRBRESP_PROMISC;
 
+	if (local->fif_probe_req || local->probe_req_reg)
+		new_flags |= FIF_PROBE_REQ;
+
 	if (local->fif_fcsfail)
 		new_flags |= FIF_FCSFAIL;
 
@@ -98,23 +102,30 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 	int ret = 0;
 	int power;
 	enum nl80211_channel_type channel_type;
+	u32 offchannel_flag;
 
 	might_sleep();
 
 	scan_chan = local->scan_channel;
 
+	offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
 	if (scan_chan) {
 		chan = scan_chan;
 		channel_type = NL80211_CHAN_NO_HT;
-	} else if (local->tmp_channel) {
+		local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
+	} else if (local->tmp_channel &&
+		   local->oper_channel != local->tmp_channel) {
 		chan = scan_chan = local->tmp_channel;
 		channel_type = local->tmp_channel_type;
+		local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL;
 	} else {
 		chan = local->oper_channel;
 		channel_type = local->_oper_channel_type;
+		local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL;
 	}
+	offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL;
 
-	if (chan != local->hw.conf.channel ||
+	if (offchannel_flag || chan != local->hw.conf.channel ||
 	    channel_type != local->hw.conf.channel_type) {
 		local->hw.conf.channel = chan;
 		local->hw.conf.channel_type = channel_type;
@@ -193,6 +204,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 		sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid;
 	else if (sdata->vif.type == NL80211_IFTYPE_AP)
 		sdata->vif.bss_conf.bssid = sdata->vif.addr;
+	else if (sdata->vif.type == NL80211_IFTYPE_WDS)
+		sdata->vif.bss_conf.bssid = NULL;
 	else if (ieee80211_vif_is_mesh(&sdata->vif)) {
 		sdata->vif.bss_conf.bssid = zero;
 	} else {
@@ -203,6 +216,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_WDS:
 	case NL80211_IFTYPE_MESH_POINT:
 		break;
 	default:
@@ -259,7 +273,6 @@ static void ieee80211_tasklet_handler(unsigned long data)
 {
 	struct ieee80211_local *local = (struct ieee80211_local *) data;
 	struct sk_buff *skb;
-	struct ieee80211_ra_tid *ra_tid;
 
 	while ((skb = skb_dequeue(&local->skb_queue)) ||
 	       (skb = skb_dequeue(&local->skb_queue_unreliable))) {
@@ -274,18 +287,6 @@ static void ieee80211_tasklet_handler(unsigned long data)
 			skb->pkt_type = 0;
 			ieee80211_tx_status(local_to_hw(local), skb);
 			break;
-		case IEEE80211_DELBA_MSG:
-			ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
-			ieee80211_stop_tx_ba_cb(ra_tid->vif, ra_tid->ra,
-						ra_tid->tid);
-			dev_kfree_skb(skb);
-			break;
-		case IEEE80211_ADDBA_MSG:
-			ra_tid = (struct ieee80211_ra_tid *) &skb->cb;
-			ieee80211_start_tx_ba_cb(ra_tid->vif, ra_tid->ra,
-						 ra_tid->tid);
-			dev_kfree_skb(skb);
-			break ;
 		default:
 			WARN(1, "mac80211: Packet is of unknown type %d\n",
 			     skb->pkt_type);
@@ -300,7 +301,16 @@ static void ieee80211_restart_work(struct work_struct *work)
 	struct ieee80211_local *local =
 		container_of(work, struct ieee80211_local, restart_work);
 
+	/* wait for scan work complete */
+	flush_workqueue(local->workqueue);
+
+	mutex_lock(&local->mtx);
+	WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
+		"%s called with hardware scan in progress\n", __func__);
+	mutex_unlock(&local->mtx);
+
 	rtnl_lock();
+	ieee80211_scan_cancel(local);
 	ieee80211_reconfig(local);
 	rtnl_unlock();
 }
@@ -311,7 +321,7 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw)
 
 	trace_api_restart_hw(local);
 
-	/* use this reason, __ieee80211_resume will unblock it */
+	/* use this reason, ieee80211_reconfig will unblock it */
 	ieee80211_stop_queues_by_reason(hw,
 		IEEE80211_QUEUE_STOP_REASON_SUSPEND);
 
@@ -325,10 +335,154 @@ static void ieee80211_recalc_smps_work(struct work_struct *work)
 		container_of(work, struct ieee80211_local, recalc_smps);
 
 	mutex_lock(&local->iflist_mtx);
-	ieee80211_recalc_smps(local, NULL);
+	ieee80211_recalc_smps(local);
 	mutex_unlock(&local->iflist_mtx);
 }
 
+#ifdef CONFIG_INET
+static int ieee80211_ifa_changed(struct notifier_block *nb,
+				 unsigned long data, void *arg)
+{
+	struct in_ifaddr *ifa = arg;
+	struct ieee80211_local *local =
+		container_of(nb, struct ieee80211_local,
+			     ifa_notifier);
+	struct net_device *ndev = ifa->ifa_dev->dev;
+	struct wireless_dev *wdev = ndev->ieee80211_ptr;
+	struct in_device *idev;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_bss_conf *bss_conf;
+	struct ieee80211_if_managed *ifmgd;
+	int c = 0;
+
+	/* Make sure it's our interface that got changed */
+	if (!wdev)
+		return NOTIFY_DONE;
+
+	if (wdev->wiphy != local->hw.wiphy)
+		return NOTIFY_DONE;
+
+	sdata = IEEE80211_DEV_TO_SUB_IF(ndev);
+	bss_conf = &sdata->vif.bss_conf;
+
+	if (!ieee80211_sdata_running(sdata))
+		return NOTIFY_DONE;
+
+	/* ARP filtering is only supported in managed mode */
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return NOTIFY_DONE;
+
+	idev = __in_dev_get_rtnl(sdata->dev);
+	if (!idev)
+		return NOTIFY_DONE;
+
+	ifmgd = &sdata->u.mgd;
+	mutex_lock(&ifmgd->mtx);
+
+	/* Copy the addresses to the bss_conf list */
+	ifa = idev->ifa_list;
+	while (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN && ifa) {
+		bss_conf->arp_addr_list[c] = ifa->ifa_address;
+		ifa = ifa->ifa_next;
+		c++;
+	}
+
+	/* If not all addresses fit the list, disable filtering */
+	if (ifa) {
+		sdata->arp_filter_state = false;
+		c = 0;
+	} else {
+		sdata->arp_filter_state = true;
+	}
+	bss_conf->arp_addr_cnt = c;
+
+	/* Configure driver only if associated */
+	if (ifmgd->associated) {
+		bss_conf->arp_filter_enabled = sdata->arp_filter_state;
+		ieee80211_bss_info_change_notify(sdata,
+						 BSS_CHANGED_ARP_FILTER);
+	}
+
+	mutex_unlock(&ifmgd->mtx);
+
+	return NOTIFY_DONE;
+}
+#endif
+
+static int ieee80211_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct ieee80211_local *local =
+		container_of(napi, struct ieee80211_local, napi);
+
+	return local->ops->napi_poll(&local->hw, budget);
+}
+
+void ieee80211_napi_schedule(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	napi_schedule(&local->napi);
+}
+EXPORT_SYMBOL(ieee80211_napi_schedule);
+
+void ieee80211_napi_complete(struct ieee80211_hw *hw)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	napi_complete(&local->napi);
+}
+EXPORT_SYMBOL(ieee80211_napi_complete);
+
+/* There isn't a lot of sense in it, but you can transmit anything you like */
+static const struct ieee80211_txrx_stypes
+ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
+	[NL80211_IFTYPE_ADHOC] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_STATION] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+	},
+	[NL80211_IFTYPE_AP] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+			BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+			BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_AP_VLAN] = {
+		/* copy AP */
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+			BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+			BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+	[NL80211_IFTYPE_P2P_CLIENT] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+	},
+	[NL80211_IFTYPE_P2P_GO] = {
+		.tx = 0xffff,
+		.rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) |
+			BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+			BIT(IEEE80211_STYPE_DISASSOC >> 4) |
+			BIT(IEEE80211_STYPE_AUTH >> 4) |
+			BIT(IEEE80211_STYPE_DEAUTH >> 4) |
+			BIT(IEEE80211_STYPE_ACTION >> 4),
+	},
+};
+
 struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 					const struct ieee80211_ops *ops)
 {
@@ -358,6 +512,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 	if (!wiphy)
 		return NULL;
 
+	wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes;
+
 	wiphy->flags |= WIPHY_FLAG_NETNS_OK |
 			WIPHY_FLAG_4ADDR_AP |
 			WIPHY_FLAG_4ADDR_STATION;
@@ -383,6 +539,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 	/* set up some defaults */
 	local->hw.queues = 1;
 	local->hw.max_rates = 1;
+	local->hw.max_report_rates = 0;
 	local->hw.conf.long_frame_max_tx_count = wiphy->retry_long;
 	local->hw.conf.short_frame_max_tx_count = wiphy->retry_short;
 	local->user_power_level = -1;
@@ -394,9 +551,9 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 	__hw_addr_init(&local->mc_list);
 
 	mutex_init(&local->iflist_mtx);
-	mutex_init(&local->scan_mtx);
+	mutex_init(&local->mtx);
 
-	spin_lock_init(&local->key_lock);
+	mutex_init(&local->key_mtx);
 	spin_lock_init(&local->filter_lock);
 	spin_lock_init(&local->queue_stop_reason_lock);
 
@@ -419,8 +576,10 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 
 	sta_info_init(local);
 
-	for (i = 0; i < IEEE80211_MAX_QUEUES; i++)
+	for (i = 0; i < IEEE80211_MAX_QUEUES; i++) {
 		skb_queue_head_init(&local->pending[i]);
+		atomic_set(&local->agg_queue_stop[i], 0);
+	}
 	tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
 		     (unsigned long)local);
 
@@ -431,7 +590,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
 	skb_queue_head_init(&local->skb_queue);
 	skb_queue_head_init(&local->skb_queue_unreliable);
 
-	spin_lock_init(&local->ampdu_lock);
+	/* init dummy netdev for use w/ NAPI */
+	init_dummy_netdev(&local->napi_dev);
 
 	return local_to_hw(local);
 }
@@ -445,6 +605,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	int channels, max_bitrates;
 	bool supp_ht;
 	static const u32 cipher_suites[] = {
+		/* keep WEP first, it may be removed below */
 		WLAN_CIPHER_SUITE_WEP40,
 		WLAN_CIPHER_SUITE_WEP104,
 		WLAN_CIPHER_SUITE_TKIP,
@@ -454,6 +615,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		WLAN_CIPHER_SUITE_AES_CMAC
 	};
 
+	if (hw->max_report_rates == 0)
+		hw->max_report_rates = hw->max_rates;
+
 	/*
 	 * generic code guarantees at least one band,
 	 * set this very early because much code assumes
@@ -493,6 +657,14 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	/* mac80211 always supports monitor */
 	local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
 
+#ifndef CONFIG_MAC80211_MESH
+	/* mesh depends on Kconfig, but drivers should set it if they want */
+	local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT);
+#endif
+
+	/* mac80211 supports control port protocol changing */
+	local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL;
+
 	if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
 		local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
 	else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
@@ -505,10 +677,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	/*
 	 * Calculate scan IE length -- we need this to alloc
 	 * memory and to subtract from the driver limit. It
-	 * includes the (extended) supported rates and HT
+	 * includes the DS Params, (extended) supported rates, and HT
 	 * information -- SSID is the driver's responsibility.
 	 */
-	local->scan_ies_len = 4 + max_bitrates; /* (ext) supp rates */
+	local->scan_ies_len = 4 + max_bitrates /* (ext) supp rates */ +
+		3 /* DS Params */;
 	if (supp_ht)
 		local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap);
 
@@ -528,10 +701,41 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 	if (local->hw.wiphy->max_scan_ie_len)
 		local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len;
 
-	local->hw.wiphy->cipher_suites = cipher_suites;
-	local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
-	if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE))
-		local->hw.wiphy->n_cipher_suites--;
+	/* Set up cipher suites unless driver already did */
+	if (!local->hw.wiphy->cipher_suites) {
+		local->hw.wiphy->cipher_suites = cipher_suites;
+		local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites);
+		if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE))
+			local->hw.wiphy->n_cipher_suites--;
+	}
+	if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) {
+		if (local->hw.wiphy->cipher_suites == cipher_suites) {
+			local->hw.wiphy->cipher_suites += 2;
+			local->hw.wiphy->n_cipher_suites -= 2;
+		} else {
+			u32 *suites;
+			int r, w = 0;
+
+			/* Filter out WEP */
+
+			suites = kmemdup(
+				local->hw.wiphy->cipher_suites,
+				sizeof(u32) * local->hw.wiphy->n_cipher_suites,
+				GFP_KERNEL);
+			if (!suites)
+				return -ENOMEM;
+			for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) {
+				u32 suite = local->hw.wiphy->cipher_suites[r];
+				if (suite == WLAN_CIPHER_SUITE_WEP40 ||
+				    suite == WLAN_CIPHER_SUITE_WEP104)
+					continue;
+				suites[w++] = suite;
+			}
+			local->hw.wiphy->cipher_suites = suites;
+			local->hw.wiphy->n_cipher_suites = w;
+			local->wiphy_ciphers_allocated = true;
+		}
+	}
 
 	result = wiphy_register(local->hw.wiphy);
 	if (result < 0)
@@ -545,7 +749,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		hw->queues = IEEE80211_MAX_QUEUES;
 
 	local->workqueue =
-		create_singlethread_workqueue(wiphy_name(local->hw.wiphy));
+		alloc_ordered_workqueue(wiphy_name(local->hw.wiphy), 0);
 	if (!local->workqueue) {
 		result = -ENOMEM;
 		goto fail_workqueue;
@@ -572,26 +776,24 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 
 	local->hw.conf.listen_interval = local->hw.max_listen_interval;
 
-	local->hw.conf.dynamic_ps_forced_timeout = -1;
+	local->dynamic_ps_forced_timeout = -1;
 
 	result = sta_info_start(local);
 	if (result < 0)
 		goto fail_sta_info;
 
 	result = ieee80211_wep_init(local);
-	if (result < 0) {
-		printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n",
-		       wiphy_name(local->hw.wiphy), result);
-		goto fail_wep;
-	}
+	if (result < 0)
+		wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
+			    result);
 
 	rtnl_lock();
 
 	result = ieee80211_init_rate_ctrl_alg(local,
 					      hw->rate_control_algorithm);
 	if (result < 0) {
-		printk(KERN_DEBUG "%s: Failed to initialize rate control "
-		       "algorithm\n", wiphy_name(local->hw.wiphy));
+		wiphy_debug(local->hw.wiphy,
+			    "Failed to initialize rate control algorithm\n");
 		goto fail_rate;
 	}
 
@@ -600,8 +802,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		result = ieee80211_if_add(local, "wlan%d", NULL,
 					  NL80211_IFTYPE_STATION, NULL);
 		if (result)
-			printk(KERN_WARNING "%s: Failed to add default virtual iface\n",
-			       wiphy_name(local->hw.wiphy));
+			wiphy_warn(local->hw.wiphy,
+				   "Failed to add default virtual iface\n");
 	}
 
 	rtnl_unlock();
@@ -612,27 +814,43 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		ieee80211_max_network_latency;
 	result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY,
 				     &local->network_latency_notifier);
-
 	if (result) {
 		rtnl_lock();
 		goto fail_pm_qos;
 	}
 
+#ifdef CONFIG_INET
+	local->ifa_notifier.notifier_call = ieee80211_ifa_changed;
+	result = register_inetaddr_notifier(&local->ifa_notifier);
+	if (result)
+		goto fail_ifa;
+#endif
+
+	netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll,
+			local->hw.napi_weight);
+
 	return 0;
 
+#ifdef CONFIG_INET
+ fail_ifa:
+	pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
+			       &local->network_latency_notifier);
+	rtnl_lock();
+#endif
  fail_pm_qos:
 	ieee80211_led_exit(local);
 	ieee80211_remove_interfaces(local);
  fail_rate:
 	rtnl_unlock();
 	ieee80211_wep_free(local);
- fail_wep:
 	sta_info_stop(local);
  fail_sta_info:
 	destroy_workqueue(local->workqueue);
  fail_workqueue:
 	wiphy_unregister(local->hw.wiphy);
  fail_wiphy_register:
+	if (local->wiphy_ciphers_allocated)
+		kfree(local->hw.wiphy->cipher_suites);
 	kfree(local->int_scan_req);
 	return result;
 }
@@ -647,6 +865,9 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
 
 	pm_qos_remove_notifier(PM_QOS_NETWORK_LATENCY,
 			       &local->network_latency_notifier);
+#ifdef CONFIG_INET
+	unregister_inetaddr_notifier(&local->ifa_notifier);
+#endif
 
 	rtnl_lock();
 
@@ -659,6 +880,13 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
 
 	rtnl_unlock();
 
+	/*
+	 * Now all work items will be gone, but the
+	 * timer might still be armed, so delete it
+	 */
+	del_timer_sync(&local->work_timer);
+
+	cancel_work_sync(&local->restart_work);
 	cancel_work_sync(&local->reconfig_filter);
 
 	ieee80211_clear_tx_pending(local);
@@ -667,8 +895,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
 
 	if (skb_queue_len(&local->skb_queue) ||
 	    skb_queue_len(&local->skb_queue_unreliable))
-		printk(KERN_WARNING "%s: skb_queue not empty\n",
-		       wiphy_name(local->hw.wiphy));
+		wiphy_warn(local->hw.wiphy, "skb_queue not empty\n");
 	skb_queue_purge(&local->skb_queue);
 	skb_queue_purge(&local->skb_queue_unreliable);
 
@@ -685,7 +912,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
 	struct ieee80211_local *local = hw_to_local(hw);
 
 	mutex_destroy(&local->iflist_mtx);
-	mutex_destroy(&local->scan_mtx);
+	mutex_destroy(&local->mtx);
+
+	if (local->wiphy_ciphers_allocated)
+		kfree(local->hw.wiphy->cipher_suites);
 
 	wiphy_free(local->hw.wiphy);
 }
@@ -704,6 +934,10 @@ static int __init ieee80211_init(void)
 	if (ret)
 		return ret;
 
+	ret = rc80211_minstrel_ht_init();
+	if (ret)
+		goto err_minstrel;
+
 	ret = rc80211_pid_init();
 	if (ret)
 		goto err_pid;
@@ -716,6 +950,8 @@ static int __init ieee80211_init(void)
  err_netdev:
 	rc80211_pid_exit();
  err_pid:
+	rc80211_minstrel_ht_exit();
+ err_minstrel:
 	rc80211_minstrel_exit();
 
 	return ret;
@@ -724,14 +960,9 @@ static int __init ieee80211_init(void)
 static void __exit ieee80211_exit(void)
 {
 	rc80211_pid_exit();
+	rc80211_minstrel_ht_exit();
 	rc80211_minstrel_exit();
 
-	/*
-	 * For key todo, it'll be empty by now but the work
-	 * might still be scheduled.
-	 */
-	flush_scheduled_work();
-
 	if (mesh_allocated)
 		ieee80211s_stop();
 
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index bde81031727..c8a4f19ed13 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -54,7 +54,7 @@ static void ieee80211_mesh_housekeeping_timer(unsigned long data)
 		return;
 	}
 
-	ieee80211_queue_work(&local->hw, &ifmsh->work);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 
 /**
@@ -345,7 +345,7 @@ static void ieee80211_mesh_path_timer(unsigned long data)
 		return;
 	}
 
-	ieee80211_queue_work(&local->hw, &ifmsh->work);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 
 static void ieee80211_mesh_path_root_timer(unsigned long data)
@@ -362,7 +362,7 @@ static void ieee80211_mesh_path_root_timer(unsigned long data)
 		return;
 	}
 
-	ieee80211_queue_work(&local->hw, &ifmsh->work);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 
 void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh)
@@ -484,9 +484,6 @@ void ieee80211_mesh_quiesce(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
-	/* might restart the timer but that doesn't matter */
-	cancel_work_sync(&ifmsh->work);
-
 	/* use atomic bitops in case both timers fire at the same time */
 
 	if (del_timer_sync(&ifmsh->housekeeping_timer))
@@ -518,7 +515,7 @@ void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 
 	set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags);
 	ieee80211_mesh_root_setup(ifmsh);
-	ieee80211_queue_work(&local->hw, &ifmsh->work);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 	sdata->vif.bss_conf.beacon_int = MESH_DEFAULT_BEACON_INTERVAL;
 	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON |
 						BSS_CHANGED_BEACON_ENABLED |
@@ -536,16 +533,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
 	 * whether the interface is running, which, at this point,
 	 * it no longer is.
 	 */
-	cancel_work_sync(&sdata->u.mesh.work);
-
-	/*
-	 * When we get here, the interface is marked down.
-	 * Call synchronize_rcu() to wait for the RX path
-	 * should it be using the interface and enqueuing
-	 * frames at this very time on another CPU.
-	 */
-	rcu_barrier(); /* Wait for RX path and call_rcu()'s */
-	skb_queue_purge(&sdata->u.mesh.skb_queue);
+	cancel_work_sync(&sdata->work);
 }
 
 static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
@@ -608,8 +596,8 @@ static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata,
 	}
 }
 
-static void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
-					  struct sk_buff *skb)
+void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				   struct sk_buff *skb)
 {
 	struct ieee80211_rx_status *rx_status;
 	struct ieee80211_if_mesh *ifmsh;
@@ -632,26 +620,11 @@ static void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 		ieee80211_mesh_rx_mgmt_action(sdata, mgmt, skb->len, rx_status);
 		break;
 	}
-
-	kfree_skb(skb);
 }
 
-static void ieee80211_mesh_work(struct work_struct *work)
+void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data, u.mesh.work);
-	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	struct sk_buff *skb;
-
-	if (!ieee80211_sdata_running(sdata))
-		return;
-
-	if (local->scanning)
-		return;
-
-	while ((skb = skb_dequeue(&ifmsh->skb_queue)))
-		ieee80211_mesh_rx_queued_mgmt(sdata, skb);
 
 	if (ifmsh->preq_queue_len &&
 	    time_after(jiffies,
@@ -678,7 +651,7 @@ void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list)
 		if (ieee80211_vif_is_mesh(&sdata->vif))
-			ieee80211_queue_work(&local->hw, &sdata->u.mesh.work);
+			ieee80211_queue_work(&local->hw, &sdata->work);
 	rcu_read_unlock();
 }
 
@@ -686,11 +659,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
 
-	INIT_WORK(&ifmsh->work, ieee80211_mesh_work);
 	setup_timer(&ifmsh->housekeeping_timer,
 		    ieee80211_mesh_housekeeping_timer,
 		    (unsigned long) sdata);
-	skb_queue_head_init(&sdata->u.mesh.skb_queue);
 
 	ifmsh->mshcfg.dot11MeshRetryTimeout = MESH_RET_T;
 	ifmsh->mshcfg.dot11MeshConfirmTimeout = MESH_CONF_T;
@@ -731,29 +702,3 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
 	INIT_LIST_HEAD(&ifmsh->preq_queue.list);
 	spin_lock_init(&ifmsh->mesh_preq_queue_lock);
 }
-
-ieee80211_rx_result
-ieee80211_mesh_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
-	struct ieee80211_mgmt *mgmt;
-	u16 fc;
-
-	if (skb->len < 24)
-		return RX_DROP_MONITOR;
-
-	mgmt = (struct ieee80211_mgmt *) skb->data;
-	fc = le16_to_cpu(mgmt->frame_control);
-
-	switch (fc & IEEE80211_FCTL_STYPE) {
-	case IEEE80211_STYPE_ACTION:
-	case IEEE80211_STYPE_PROBE_RESP:
-	case IEEE80211_STYPE_BEACON:
-		skb_queue_tail(&ifmsh->skb_queue, skb);
-		ieee80211_queue_work(&local->hw, &ifmsh->work);
-		return RX_QUEUED;
-	}
-
-	return RX_CONTINUE;
-}
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index c88087f1cd0..58e74112896 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -115,7 +115,7 @@ struct mesh_path {
  * @hash_rnd: random value used for hash computations
  * @entries: number of entries in the table
  * @free_node: function to free nodes of the table
- * @copy_node: fuction to copy nodes of the table
+ * @copy_node: function to copy nodes of the table
  * @size_order: determines size of the table, there will be 2^size_order hash
  *	buckets
  * @mean_chain_len: maximum average length for the hash buckets' list, if it is
@@ -237,8 +237,6 @@ void ieee80211s_update_metric(struct ieee80211_local *local,
 		struct sta_info *stainfo, struct sk_buff *skb);
 void ieee80211s_stop(void);
 void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata);
-ieee80211_rx_result
-ieee80211_mesh_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb);
 void ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata);
 void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata);
 void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh);
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 0705018d8d1..829e08a657d 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -805,14 +805,14 @@ static void mesh_queue_preq(struct mesh_path *mpath, u8 flags)
 	spin_unlock(&ifmsh->mesh_preq_queue_lock);
 
 	if (time_after(jiffies, ifmsh->last_preq + min_preq_int_jiff(sdata)))
-		ieee80211_queue_work(&sdata->local->hw, &ifmsh->work);
+		ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 
 	else if (time_before(jiffies, ifmsh->last_preq)) {
 		/* avoid long wait if did not send preqs for a long time
 		 * and jiffies wrapped around
 		 */
 		ifmsh->last_preq = jiffies - min_preq_int_jiff(sdata) - 1;
-		ieee80211_queue_work(&sdata->local->hw, &ifmsh->work);
+		ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 	} else
 		mod_timer(&ifmsh->mesh_path_timer, ifmsh->last_preq +
 						min_preq_int_jiff(sdata));
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 181ffd6efd8..349e466cf08 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -315,7 +315,7 @@ int mesh_path_add(u8 *dst, struct ieee80211_sub_if_data *sdata)
 	read_unlock(&pathtbl_resize_lock);
 	if (grow) {
 		set_bit(MESH_WORK_GROW_MPATH_TABLE,  &ifmsh->wrkq_flags);
-		ieee80211_queue_work(&local->hw, &ifmsh->work);
+		ieee80211_queue_work(&local->hw, &sdata->work);
 	}
 	return 0;
 
@@ -425,7 +425,7 @@ int mpp_path_add(u8 *dst, u8 *mpp, struct ieee80211_sub_if_data *sdata)
 	read_unlock(&pathtbl_resize_lock);
 	if (grow) {
 		set_bit(MESH_WORK_GROW_MPP_TABLE,  &ifmsh->wrkq_flags);
-		ieee80211_queue_work(&local->hw, &ifmsh->work);
+		ieee80211_queue_work(&local->hw, &sdata->work);
 	}
 	return 0;
 
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 3cd5f7b5d69..1c91f0f3c30 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -65,7 +65,6 @@ void mesh_plink_inc_estab_count(struct ieee80211_sub_if_data *sdata)
 {
 	atomic_inc(&sdata->u.mesh.mshstats.estab_plinks);
 	mesh_accept_plinks_update(sdata);
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 }
 
 static inline
@@ -73,7 +72,6 @@ void mesh_plink_dec_estab_count(struct ieee80211_sub_if_data *sdata)
 {
 	atomic_dec(&sdata->u.mesh.mshstats.estab_plinks);
 	mesh_accept_plinks_update(sdata);
-	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 }
 
 /**
@@ -115,7 +113,7 @@ static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata,
 }
 
 /**
- * mesh_plink_deactivate - deactivate mesh peer link
+ * __mesh_plink_deactivate - deactivate mesh peer link
  *
  * @sta: mesh peer link to deactivate
  *
@@ -123,18 +121,23 @@ static struct sta_info *mesh_plink_alloc(struct ieee80211_sub_if_data *sdata,
  *
  * Locking: the caller must hold sta->lock
  */
-static void __mesh_plink_deactivate(struct sta_info *sta)
+static bool __mesh_plink_deactivate(struct sta_info *sta)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	bool deactivated = false;
 
-	if (sta->plink_state == PLINK_ESTAB)
+	if (sta->plink_state == PLINK_ESTAB) {
 		mesh_plink_dec_estab_count(sdata);
+		deactivated = true;
+	}
 	sta->plink_state = PLINK_BLOCKED;
 	mesh_path_flush_by_nexthop(sta);
+
+	return deactivated;
 }
 
 /**
- * __mesh_plink_deactivate - deactivate mesh peer link
+ * mesh_plink_deactivate - deactivate mesh peer link
  *
  * @sta: mesh peer link to deactivate
  *
@@ -142,9 +145,15 @@ static void __mesh_plink_deactivate(struct sta_info *sta)
  */
 void mesh_plink_deactivate(struct sta_info *sta)
 {
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	bool deactivated;
+
 	spin_lock_bh(&sta->lock);
-	__mesh_plink_deactivate(sta);
+	deactivated = __mesh_plink_deactivate(sta);
 	spin_unlock_bh(&sta->lock);
+
+	if (deactivated)
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 }
 
 static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
@@ -381,10 +390,16 @@ int mesh_plink_open(struct sta_info *sta)
 
 void mesh_plink_block(struct sta_info *sta)
 {
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	bool deactivated;
+
 	spin_lock_bh(&sta->lock);
-	__mesh_plink_deactivate(sta);
+	deactivated = __mesh_plink_deactivate(sta);
 	sta->plink_state = PLINK_BLOCKED;
 	spin_unlock_bh(&sta->lock);
+
+	if (deactivated)
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 }
 
 
@@ -397,6 +412,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 	enum plink_event event;
 	enum plink_frame_type ftype;
 	size_t baselen;
+	bool deactivated, matches_local = true;
 	u8 ie_len;
 	u8 *baseaddr;
 	__le16 plid, llid, reason;
@@ -471,6 +487,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 	/* Now we will figure out the appropriate event... */
 	event = PLINK_UNDEFINED;
 	if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) {
+		matches_local = false;
 		switch (ftype) {
 		case PLINK_OPEN:
 			event = OPN_RJCT;
@@ -482,7 +499,15 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			/* avoid warning */
 			break;
 		}
-		spin_lock_bh(&sta->lock);
+	}
+
+	if (!sta && !matches_local) {
+		rcu_read_unlock();
+		reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION);
+		llid = 0;
+		mesh_plink_frame_tx(sdata, PLINK_CLOSE, mgmt->sa, llid,
+				    plid, reason);
+		return;
 	} else if (!sta) {
 		/* ftype == PLINK_OPEN */
 		u32 rates;
@@ -506,7 +531,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 		}
 		event = OPN_ACPT;
 		spin_lock_bh(&sta->lock);
-	} else {
+	} else if (matches_local) {
 		spin_lock_bh(&sta->lock);
 		switch (ftype) {
 		case PLINK_OPEN:
@@ -548,6 +573,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 			rcu_read_unlock();
 			return;
 		}
+	} else {
+		spin_lock_bh(&sta->lock);
 	}
 
 	mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n",
@@ -651,8 +678,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 		case CNF_ACPT:
 			del_timer(&sta->plink_timer);
 			sta->plink_state = PLINK_ESTAB;
-			mesh_plink_inc_estab_count(sdata);
 			spin_unlock_bh(&sta->lock);
+			mesh_plink_inc_estab_count(sdata);
+			ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 			mpl_dbg("Mesh plink with %pM ESTABLISHED\n",
 				sta->sta.addr);
 			break;
@@ -684,8 +712,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 		case OPN_ACPT:
 			del_timer(&sta->plink_timer);
 			sta->plink_state = PLINK_ESTAB;
-			mesh_plink_inc_estab_count(sdata);
 			spin_unlock_bh(&sta->lock);
+			mesh_plink_inc_estab_count(sdata);
+			ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 			mpl_dbg("Mesh plink with %pM ESTABLISHED\n",
 				sta->sta.addr);
 			mesh_plink_frame_tx(sdata, PLINK_CONFIRM, sta->sta.addr, llid,
@@ -702,11 +731,13 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m
 		case CLS_ACPT:
 			reason = cpu_to_le16(MESH_CLOSE_RCVD);
 			sta->reason = reason;
-			__mesh_plink_deactivate(sta);
+			deactivated = __mesh_plink_deactivate(sta);
 			sta->plink_state = PLINK_HOLDING;
 			llid = sta->llid;
 			mod_plink_timer(sta, dot11MeshHoldingTimeout(sdata));
 			spin_unlock_bh(&sta->lock);
+			if (deactivated)
+				ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON);
 			mesh_plink_frame_tx(sdata, PLINK_CLOSE, sta->sta.addr, llid,
 					    plid, reason);
 			break;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index f803f8b72a9..a3a9421555a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -54,6 +54,12 @@
  */
 #define IEEE80211_SIGNAL_AVE_WEIGHT	3
 
+/*
+ * How many Beacon frames need to have been used in average signal strength
+ * before starting to indicate signal change events.
+ */
+#define IEEE80211_SIGNAL_AVE_MIN_COUNT	4
+
 #define TMR_RUNNING_TIMER	0
 #define TMR_RUNNING_CHANSW	1
 
@@ -86,7 +92,7 @@ enum rx_mgmt_action {
 /* utils */
 static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd)
 {
-	WARN_ON(!mutex_is_locked(&ifmgd->mtx));
+	lockdep_assert_held(&ifmgd->mtx);
 }
 
 /*
@@ -109,7 +115,7 @@ static void run_again(struct ieee80211_if_managed *ifmgd,
 		mod_timer(&ifmgd->timer, timeout);
 }
 
-static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata)
+void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata)
 {
 	if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER)
 		return;
@@ -118,6 +124,19 @@ static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata)
 		  round_jiffies_up(jiffies + IEEE80211_BEACON_LOSS_TIME));
 }
 
+void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+	if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
+		return;
+
+	mod_timer(&sdata->u.mgd.conn_mon_timer,
+		  round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
+
+	ifmgd->probe_send_count = 0;
+}
+
 static int ecw2cw(int ecw)
 {
 	return (1 << ecw) - 1;
@@ -478,6 +497,39 @@ static void ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata,
 	}
 }
 
+void ieee80211_enable_dyn_ps(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_conf *conf = &local->hw.conf;
+
+	WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION ||
+		!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) ||
+		(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS));
+
+	local->disable_dynamic_ps = false;
+	conf->dynamic_ps_timeout = local->dynamic_ps_user_timeout;
+}
+EXPORT_SYMBOL(ieee80211_enable_dyn_ps);
+
+void ieee80211_disable_dyn_ps(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_conf *conf = &local->hw.conf;
+
+	WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION ||
+		!(local->hw.flags & IEEE80211_HW_SUPPORTS_PS) ||
+		(local->hw.flags & IEEE80211_HW_SUPPORTS_DYNAMIC_PS));
+
+	local->disable_dynamic_ps = true;
+	conf->dynamic_ps_timeout = 0;
+	del_timer_sync(&local->dynamic_ps_timer);
+	ieee80211_queue_work(&local->hw,
+			     &local->dynamic_ps_enable_work);
+}
+EXPORT_SYMBOL(ieee80211_disable_dyn_ps);
+
 /* powersave */
 static void ieee80211_enable_ps(struct ieee80211_local *local,
 				struct ieee80211_sub_if_data *sdata)
@@ -553,6 +605,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
 	    found->u.mgd.associated->beacon_ies &&
 	    !(found->u.mgd.flags & (IEEE80211_STA_BEACON_POLL |
 				    IEEE80211_STA_CONNECTION_POLL))) {
+		struct ieee80211_conf *conf = &local->hw.conf;
 		s32 beaconint_us;
 
 		if (latency < 0)
@@ -561,25 +614,24 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
 		beaconint_us = ieee80211_tu_to_usec(
 					found->vif.bss_conf.beacon_int);
 
-		timeout = local->hw.conf.dynamic_ps_forced_timeout;
+		timeout = local->dynamic_ps_forced_timeout;
 		if (timeout < 0) {
 			/*
+			 * Go to full PSM if the user configures a very low
+			 * latency requirement.
 			 * The 2 second value is there for compatibility until
 			 * the PM_QOS_NETWORK_LATENCY is configured with real
 			 * values.
 			 */
-			if (latency == 2000000000)
-				timeout = 100;
-			else if (latency <= 50000)
-				timeout = 300;
-			else if (latency <= 100000)
-				timeout = 100;
-			else if (latency <= 500000)
-				timeout = 50;
-			else
+			if (latency > 1900000000 && latency != 2000000000)
 				timeout = 0;
+			else
+				timeout = 100;
 		}
-		local->hw.conf.dynamic_ps_timeout = timeout;
+		local->dynamic_ps_user_timeout = timeout;
+		if (!local->disable_dynamic_ps)
+			conf->dynamic_ps_timeout =
+				local->dynamic_ps_user_timeout;
 
 		if (beaconint_us > latency) {
 			local->ps_sdata = NULL;
@@ -665,10 +717,11 @@ void ieee80211_dynamic_ps_timer(unsigned long data)
 
 /* MLME */
 static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
-				     struct ieee80211_if_managed *ifmgd,
+				     struct ieee80211_sub_if_data *sdata,
 				     u8 *wmm_param, size_t wmm_param_len)
 {
 	struct ieee80211_tx_queue_params params;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	size_t left;
 	int count;
 	u8 *pos, uapsd_queues = 0;
@@ -744,21 +797,22 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
 		params.uapsd = uapsd;
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-		printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d "
-		       "cWmin=%d cWmax=%d txop=%d uapsd=%d\n",
-		       wiphy_name(local->hw.wiphy), queue, aci, acm,
-		       params.aifs, params.cw_min, params.cw_max, params.txop,
-		       params.uapsd);
+		wiphy_debug(local->hw.wiphy,
+			    "WMM queue=%d aci=%d acm=%d aifs=%d "
+			    "cWmin=%d cWmax=%d txop=%d uapsd=%d\n",
+			    queue, aci, acm,
+			    params.aifs, params.cw_min, params.cw_max,
+			    params.txop, params.uapsd);
 #endif
 		if (drv_conf_tx(local, queue, &params))
-			printk(KERN_DEBUG "%s: failed to set TX queue "
-			       "parameters for queue %d\n",
-			       wiphy_name(local->hw.wiphy), queue);
+			wiphy_debug(local->hw.wiphy,
+				    "failed to set TX queue parameters for queue %d\n",
+				    queue);
 	}
 
 	/* enable WMM or activate new settings */
-	local->hw.conf.flags |=	IEEE80211_CONF_QOS;
-	drv_config(local, IEEE80211_CONF_CHANGE_QOS);
+	sdata->vif.bss_conf.qos = true;
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
 }
 
 static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
@@ -806,11 +860,12 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_bss *bss = (void *)cbss->priv;
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
 
 	bss_info_changed |= BSS_CHANGED_ASSOC;
 	/* set timing information */
-	sdata->vif.bss_conf.beacon_int = cbss->beacon_interval;
-	sdata->vif.bss_conf.timestamp = cbss->tsf;
+	bss_conf->beacon_int = cbss->beacon_interval;
+	bss_conf->timestamp = cbss->tsf;
 
 	bss_info_changed |= BSS_CHANGED_BEACON_INT;
 	bss_info_changed |= ieee80211_handle_bss_capability(sdata,
@@ -825,17 +880,14 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL |
 				IEEE80211_STA_BEACON_POLL);
 
-	/*
-	 * Always handle WMM once after association regardless
-	 * of the first value the AP uses. Setting -1 here has
-	 * that effect because the AP values is an unsigned
-	 * 4-bit value.
-	 */
-	sdata->u.mgd.wmm_last_param_set = -1;
-
 	ieee80211_led_assoc(local, 1);
 
-	sdata->vif.bss_conf.assoc = 1;
+	if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD)
+		bss_conf->dtim_period = bss->dtim_period;
+	else
+		bss_conf->dtim_period = 0;
+
+	bss_conf->assoc = 1;
 	/*
 	 * For now just always ask the driver to update the basic rateset
 	 * when we have associated, we aren't checking whether it actually
@@ -848,14 +900,20 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 
 	/* Tell the driver to monitor connection quality (if supported) */
 	if ((local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI) &&
-	    sdata->vif.bss_conf.cqm_rssi_thold)
+	    bss_conf->cqm_rssi_thold)
 		bss_info_changed |= BSS_CHANGED_CQM;
 
+	/* Enable ARP filtering */
+	if (bss_conf->arp_filter_enabled != sdata->arp_filter_state) {
+		bss_conf->arp_filter_enabled = sdata->arp_filter_state;
+		bss_info_changed |= BSS_CHANGED_ARP_FILTER;
+	}
+
 	ieee80211_bss_info_change_notify(sdata, bss_info_changed);
 
 	mutex_lock(&local->iflist_mtx);
 	ieee80211_recalc_ps(local, -1);
-	ieee80211_recalc_smps(local, sdata);
+	ieee80211_recalc_smps(local);
 	mutex_unlock(&local->iflist_mtx);
 
 	netif_tx_start_all_queues(sdata->dev);
@@ -863,7 +921,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 }
 
 static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
-				   bool remove_sta)
+				   bool remove_sta, bool tx)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct ieee80211_local *local = sdata->local;
@@ -898,13 +956,13 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	netif_tx_stop_all_queues(sdata->dev);
 	netif_carrier_off(sdata->dev);
 
-	rcu_read_lock();
+	mutex_lock(&local->sta_mtx);
 	sta = sta_info_get(sdata, bssid);
 	if (sta) {
-		set_sta_flags(sta, WLAN_STA_DISASSOC);
-		ieee80211_sta_tear_down_BA_sessions(sta);
+		set_sta_flags(sta, WLAN_STA_BLOCK_BA);
+		ieee80211_sta_tear_down_BA_sessions(sta, tx);
 	}
-	rcu_read_unlock();
+	mutex_unlock(&local->sta_mtx);
 
 	changed |= ieee80211_reset_erp_info(sdata);
 
@@ -932,12 +990,23 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_hw_config(local, config_changed);
 
+	/* Disable ARP filtering */
+	if (sdata->vif.bss_conf.arp_filter_enabled) {
+		sdata->vif.bss_conf.arp_filter_enabled = false;
+		changed |= BSS_CHANGED_ARP_FILTER;
+	}
+
 	/* The BSSID (not really interesting) and HT changed */
 	changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT;
 	ieee80211_bss_info_change_notify(sdata, changed);
 
 	if (remove_sta)
 		sta_info_destroy_addr(sdata, bssid);
+
+	del_timer_sync(&sdata->u.mgd.conn_mon_timer);
+	del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
+	del_timer_sync(&sdata->u.mgd.timer);
+	del_timer_sync(&sdata->u.mgd.chswitch_timer);
 }
 
 void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
@@ -954,21 +1023,26 @@ void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
 	if (is_multicast_ether_addr(hdr->addr1))
 		return;
 
-	if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR)
-		return;
-
-	mod_timer(&sdata->u.mgd.conn_mon_timer,
-		  round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME));
+	ieee80211_sta_reset_conn_monitor(sdata);
 }
 
 static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	const u8 *ssid;
+	u8 *dst = ifmgd->associated->bssid;
+	u8 unicast_limit = max(1, IEEE80211_MAX_PROBE_TRIES - 3);
+
+	/*
+	 * Try sending broadcast probe requests for the last three
+	 * probe requests after the first ones failed since some
+	 * buggy APs only support broadcast probe requests.
+	 */
+	if (ifmgd->probe_send_count >= unicast_limit)
+		dst = NULL;
 
 	ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID);
-	ieee80211_send_probe_req(sdata, ifmgd->associated->bssid,
-				 ssid + 2, ssid[1], NULL, 0);
+	ieee80211_send_probe_req(sdata, dst, ssid + 2, ssid[1], NULL, 0);
 
 	ifmgd->probe_send_count++;
 	ifmgd->probe_timeout = jiffies + IEEE80211_PROBE_WAIT;
@@ -1050,9 +1124,12 @@ static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata)
 
 	printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid);
 
-	ieee80211_set_disassoc(sdata, true);
-	ieee80211_recalc_idle(local);
+	ieee80211_set_disassoc(sdata, true, true);
 	mutex_unlock(&ifmgd->mtx);
+
+	mutex_lock(&local->mtx);
+	ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
 	/*
 	 * must be outside lock due to cfg80211,
 	 * but that's not a problem.
@@ -1120,8 +1197,10 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
 	printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n",
 			sdata->name, bssid, reason_code);
 
-	ieee80211_set_disassoc(sdata, true);
+	ieee80211_set_disassoc(sdata, true, false);
+	mutex_lock(&sdata->local->mtx);
 	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
 
 	return RX_MGMT_CFG80211_DEAUTH;
 }
@@ -1150,8 +1229,10 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
 	printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n",
 			sdata->name, mgmt->sa, reason_code);
 
-	ieee80211_set_disassoc(sdata, true);
+	ieee80211_set_disassoc(sdata, true, false);
+	mutex_lock(&sdata->local->mtx);
 	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
 	return RX_MGMT_CFG80211_DISASSOC;
 }
 
@@ -1210,7 +1291,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
 
 	rates = 0;
 	basic_rates = 0;
-	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
+	sband = local->hw.wiphy->bands[wk->chan->band];
 
 	for (i = 0; i < elems.supp_rates_len; i++) {
 		int rate = (elems.supp_rates[i] & 0x7f) * 5;
@@ -1246,11 +1327,11 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
 		}
 	}
 
-	sta->sta.supp_rates[local->hw.conf.channel->band] = rates;
+	sta->sta.supp_rates[wk->chan->band] = rates;
 	sdata->vif.bss_conf.basic_rates = basic_rates;
 
 	/* cf. IEEE 802.11 9.2.12 */
-	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
+	if (wk->chan->band == IEEE80211_BAND_2GHZ &&
 	    have_higher_than_11mbit)
 		sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
 	else
@@ -1278,8 +1359,16 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
 		return false;
 	}
 
+	/*
+	 * Always handle WMM once after association regardless
+	 * of the first value the AP uses. Setting -1 here has
+	 * that effect because the AP values is an unsigned
+	 * 4-bit value.
+	 */
+	ifmgd->wmm_last_param_set = -1;
+
 	if (elems.wmm_param)
-		ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
+		ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
 					 elems.wmm_param_len);
 	else
 		ieee80211_set_wmm_default(sdata);
@@ -1310,7 +1399,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk,
 	 * Also start the timer that will detect beacon loss.
 	 */
 	ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt);
-	mod_beacon_timer(sdata);
+	ieee80211_sta_reset_beacon_monitor(sdata);
 
 	return true;
 }
@@ -1413,7 +1502,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
 		 * we have or will be receiving any beacons or data, so let's
 		 * schedule the timers again, just in case.
 		 */
-		mod_beacon_timer(sdata);
+		ieee80211_sta_reset_beacon_monitor(sdata);
 
 		mod_timer(&ifmgd->conn_mon_timer,
 			  round_jiffies_up(jiffies +
@@ -1488,15 +1577,18 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	ifmgd->last_beacon_signal = rx_status->signal;
 	if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
 		ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
-		ifmgd->ave_beacon_signal = rx_status->signal;
+		ifmgd->ave_beacon_signal = rx_status->signal * 16;
 		ifmgd->last_cqm_event_signal = 0;
+		ifmgd->count_beacon_signal = 1;
 	} else {
 		ifmgd->ave_beacon_signal =
 			(IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 +
 			 (16 - IEEE80211_SIGNAL_AVE_WEIGHT) *
 			 ifmgd->ave_beacon_signal) / 16;
+		ifmgd->count_beacon_signal++;
 	}
 	if (bss_conf->cqm_rssi_thold &&
+	    ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT &&
 	    !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) {
 		int sig = ifmgd->ave_beacon_signal / 16;
 		int last_event = ifmgd->last_cqm_event_signal;
@@ -1536,7 +1628,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	 * Push the beacon loss detection into the future since
 	 * we are processing a beacon from the AP just now.
 	 */
-	mod_beacon_timer(sdata);
+	ieee80211_sta_reset_beacon_monitor(sdata);
 
 	ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4);
 	ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable,
@@ -1547,11 +1639,11 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 		directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len,
 						   ifmgd->aid);
 
-	if (ncrc != ifmgd->beacon_crc) {
+	if (ncrc != ifmgd->beacon_crc || !ifmgd->beacon_crc_valid) {
 		ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems,
 				      true);
 
-		ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
+		ieee80211_sta_wmm_params(local, sdata, elems.wmm_param,
 					 elems.wmm_param_len);
 	}
 
@@ -1578,9 +1670,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (ncrc == ifmgd->beacon_crc)
+	if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid)
 		return;
 	ifmgd->beacon_crc = ncrc;
+	ifmgd->beacon_crc_valid = true;
 
 	if (elems.erp_info && elems.erp_info_len >= 1) {
 		erp_valid = true;
@@ -1633,35 +1726,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	ieee80211_bss_info_change_notify(sdata, changed);
 }
 
-ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
-					  struct sk_buff *skb)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_mgmt *mgmt;
-	u16 fc;
-
-	if (skb->len < 24)
-		return RX_DROP_MONITOR;
-
-	mgmt = (struct ieee80211_mgmt *) skb->data;
-	fc = le16_to_cpu(mgmt->frame_control);
-
-	switch (fc & IEEE80211_FCTL_STYPE) {
-	case IEEE80211_STYPE_PROBE_RESP:
-	case IEEE80211_STYPE_BEACON:
-	case IEEE80211_STYPE_DEAUTH:
-	case IEEE80211_STYPE_DISASSOC:
-	case IEEE80211_STYPE_ACTION:
-		skb_queue_tail(&sdata->u.mgd.skb_queue, skb);
-		ieee80211_queue_work(&local->hw, &sdata->u.mgd.work);
-		return RX_QUEUED;
-	}
-
-	return RX_DROP_MONITOR;
-}
-
-static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
-					 struct sk_buff *skb)
+void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
+				  struct sk_buff *skb)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct ieee80211_rx_status *rx_status;
@@ -1693,44 +1759,6 @@ static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 			break;
 		case IEEE80211_STYPE_ACTION:
 			switch (mgmt->u.action.category) {
-			case WLAN_CATEGORY_BACK: {
-				struct ieee80211_local *local = sdata->local;
-				int len = skb->len;
-				struct sta_info *sta;
-
-				rcu_read_lock();
-				sta = sta_info_get(sdata, mgmt->sa);
-				if (!sta) {
-					rcu_read_unlock();
-					break;
-				}
-
-				local_bh_disable();
-
-				switch (mgmt->u.action.u.addba_req.action_code) {
-				case WLAN_ACTION_ADDBA_REQ:
-					if (len < (IEEE80211_MIN_ACTION_SIZE +
-						   sizeof(mgmt->u.action.u.addba_req)))
-						break;
-					ieee80211_process_addba_request(local, sta, mgmt, len);
-					break;
-				case WLAN_ACTION_ADDBA_RESP:
-					if (len < (IEEE80211_MIN_ACTION_SIZE +
-						   sizeof(mgmt->u.action.u.addba_resp)))
-						break;
-					ieee80211_process_addba_resp(local, sta, mgmt, len);
-					break;
-				case WLAN_ACTION_DELBA:
-					if (len < (IEEE80211_MIN_ACTION_SIZE +
-						   sizeof(mgmt->u.action.u.delba)))
-						break;
-					ieee80211_process_delba(sdata, sta, mgmt, len);
-					break;
-				}
-				local_bh_enable();
-				rcu_read_unlock();
-				break;
-				}
 			case WLAN_CATEGORY_SPECTRUM_MGMT:
 				ieee80211_sta_process_chanswitch(sdata,
 						&mgmt->u.action.u.chan_switch.sw_elem,
@@ -1754,7 +1782,7 @@ static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 		default:
 			WARN(1, "unexpected: %d", rma);
 		}
-		goto out;
+		return;
 	}
 
 	mutex_unlock(&ifmgd->mtx);
@@ -1764,12 +1792,13 @@ static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 		struct ieee80211_local *local = sdata->local;
 		struct ieee80211_work *wk;
 
-		mutex_lock(&local->work_mtx);
+		mutex_lock(&local->mtx);
 		list_for_each_entry(wk, &local->work_list, list) {
 			if (wk->sdata != sdata)
 				continue;
 
-			if (wk->type != IEEE80211_WORK_ASSOC)
+			if (wk->type != IEEE80211_WORK_ASSOC &&
+			    wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT)
 				continue;
 
 			if (memcmp(mgmt->bssid, wk->filter_ta, ETH_ALEN))
@@ -1795,12 +1824,10 @@ static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
 			free_work(wk);
 			break;
 		}
-		mutex_unlock(&local->work_mtx);
+		mutex_unlock(&local->mtx);
 
 		cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len);
 	}
- out:
-	kfree_skb(skb);
 }
 
 static void ieee80211_sta_timer(unsigned long data)
@@ -1815,39 +1842,13 @@ static void ieee80211_sta_timer(unsigned long data)
 		return;
 	}
 
-	ieee80211_queue_work(&local->hw, &ifmgd->work);
+	ieee80211_queue_work(&local->hw, &sdata->work);
 }
 
-static void ieee80211_sta_work(struct work_struct *work)
+void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
 {
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data, u.mgd.work);
 	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_managed *ifmgd;
-	struct sk_buff *skb;
-
-	if (!ieee80211_sdata_running(sdata))
-		return;
-
-	if (local->scanning)
-		return;
-
-	if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
-		return;
-
-	/*
-	 * ieee80211_queue_work() should have picked up most cases,
-	 * here we'll pick the the rest.
-	 */
-	if (WARN(local->suspended, "STA MLME work scheduled while "
-		 "going to suspend\n"))
-		return;
-
-	ifmgd = &sdata->u.mgd;
-
-	/* first process frames to avoid timing out while a frame is pending */
-	while ((skb = skb_dequeue(&ifmgd->skb_queue)))
-		ieee80211_sta_rx_queued_mgmt(sdata, skb);
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
 	/* then process the rest of the work */
 	mutex_lock(&ifmgd->mtx);
@@ -1863,10 +1864,12 @@ static void ieee80211_sta_work(struct work_struct *work)
 
 		else if (ifmgd->probe_send_count < IEEE80211_MAX_PROBE_TRIES) {
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-			printk(KERN_DEBUG "No probe response from AP %pM"
-				" after %dms, try %d\n", bssid,
-				(1000 * IEEE80211_PROBE_WAIT)/HZ,
-				ifmgd->probe_send_count);
+			wiphy_debug(local->hw.wiphy,
+				    "%s: No probe response from AP %pM"
+				    " after %dms, try %d\n",
+				    sdata->name,
+				    bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ,
+				    ifmgd->probe_send_count);
 #endif
 			ieee80211_mgd_probe_ap_send(sdata);
 		} else {
@@ -1876,12 +1879,16 @@ static void ieee80211_sta_work(struct work_struct *work)
 			 */
 			ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL |
 					  IEEE80211_STA_BEACON_POLL);
-			printk(KERN_DEBUG "No probe response from AP %pM"
-				" after %dms, disconnecting.\n",
-				bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ);
-			ieee80211_set_disassoc(sdata, true);
-			ieee80211_recalc_idle(local);
+			wiphy_debug(local->hw.wiphy,
+				    "%s: No probe response from AP %pM"
+				    " after %dms, disconnecting.\n",
+				    sdata->name,
+				    bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ);
+			ieee80211_set_disassoc(sdata, true, true);
 			mutex_unlock(&ifmgd->mtx);
+			mutex_lock(&local->mtx);
+			ieee80211_recalc_idle(local);
+			mutex_unlock(&local->mtx);
 			/*
 			 * must be outside lock due to cfg80211,
 			 * but that's not a problem.
@@ -1942,8 +1949,7 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
 		ieee80211_queue_work(&sdata->local->hw,
 			   &sdata->u.mgd.monitor_work);
 		/* and do all the other regular work too */
-		ieee80211_queue_work(&sdata->local->hw,
-			   &sdata->u.mgd.work);
+		ieee80211_queue_work(&sdata->local->hw, &sdata->work);
 	}
 }
 
@@ -1958,7 +1964,8 @@ void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata)
 	 * time -- the code here is properly synchronised.
 	 */
 
-	cancel_work_sync(&ifmgd->work);
+	cancel_work_sync(&ifmgd->request_smps_work);
+
 	cancel_work_sync(&ifmgd->beacon_connection_loss_work);
 	if (del_timer_sync(&ifmgd->timer))
 		set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running);
@@ -1990,11 +1997,11 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_if_managed *ifmgd;
 
 	ifmgd = &sdata->u.mgd;
-	INIT_WORK(&ifmgd->work, ieee80211_sta_work);
 	INIT_WORK(&ifmgd->monitor_work, ieee80211_sta_monitor_work);
 	INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
 	INIT_WORK(&ifmgd->beacon_connection_loss_work,
 		  ieee80211_beacon_connection_loss_work);
+	INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_work);
 	setup_timer(&ifmgd->timer, ieee80211_sta_timer,
 		    (unsigned long) sdata);
 	setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer,
@@ -2003,7 +2010,6 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 		    (unsigned long) sdata);
 	setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer,
 		    (unsigned long) sdata);
-	skb_queue_head_init(&ifmgd->skb_queue);
 
 	ifmgd->flags = 0;
 
@@ -2081,6 +2087,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
 		auth_alg = WLAN_AUTH_OPEN;
 		break;
 	case NL80211_AUTHTYPE_SHARED_KEY:
+		if (IS_ERR(sdata->local->wep_tx_tfm))
+			return -EOPNOTSUPP;
 		auth_alg = WLAN_AUTH_SHARED_KEY;
 		break;
 	case NL80211_AUTHTYPE_FT:
@@ -2134,6 +2142,8 @@ static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk,
 						  struct sk_buff *skb)
 {
 	struct ieee80211_mgmt *mgmt;
+	struct ieee80211_rx_status *rx_status;
+	struct ieee802_11_elems elems;
 	u16 status;
 
 	if (!skb) {
@@ -2141,6 +2151,19 @@ static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk,
 		return WORK_DONE_DESTROY;
 	}
 
+	if (wk->type == IEEE80211_WORK_ASSOC_BEACON_WAIT) {
+		mutex_lock(&wk->sdata->u.mgd.mtx);
+		rx_status = (void *) skb->cb;
+		ieee802_11_parse_elems(skb->data + 24 + 12, skb->len - 24 - 12, &elems);
+		ieee80211_rx_bss_info(wk->sdata, (void *)skb->data, skb->len, rx_status,
+				      &elems, true);
+		mutex_unlock(&wk->sdata->u.mgd.mtx);
+
+		wk->type = IEEE80211_WORK_ASSOC;
+		/* not really done yet */
+		return WORK_DONE_REQUEUE;
+	}
+
 	mgmt = (void *)skb->data;
 	status = le16_to_cpu(mgmt->u.assoc_resp.status_code);
 
@@ -2153,6 +2176,7 @@ static enum work_done_result ieee80211_assoc_done(struct ieee80211_work *wk,
 						    wk->filter_ta);
 			return WORK_DONE_DESTROY;
 		}
+
 		mutex_unlock(&wk->sdata->u.mgd.mtx);
 	}
 
@@ -2184,7 +2208,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 		}
 
 		/* Trying to reassociate - clear previous association state */
-		ieee80211_set_disassoc(sdata, true);
+		ieee80211_set_disassoc(sdata, true, false);
 	}
 	mutex_unlock(&ifmgd->mtx);
 
@@ -2195,6 +2219,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N;
 	ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
 
+	ifmgd->beacon_crc_valid = false;
+
 	for (i = 0; i < req->crypto.n_ciphers_pairwise; i++)
 		if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 ||
 		    req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP ||
@@ -2253,10 +2279,14 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	if (req->prev_bssid)
 		memcpy(wk->assoc.prev_bssid, req->prev_bssid, ETH_ALEN);
 
-	wk->type = IEEE80211_WORK_ASSOC;
 	wk->chan = req->bss->channel;
 	wk->sdata = sdata;
 	wk->done = ieee80211_assoc_done;
+	if (!bss->dtim_period &&
+	    sdata->local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD)
+		wk->type = IEEE80211_WORK_ASSOC_BEACON_WAIT;
+	else
+		wk->type = IEEE80211_WORK_ASSOC;
 
 	if (req->use_mfp) {
 		ifmgd->mfp = IEEE80211_MFP_REQUIRED;
@@ -2271,6 +2301,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 	else
 		ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT;
 
+	sdata->control_port_protocol = req->crypto.control_port_ethertype;
+	sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
+
 	ieee80211_add_work(wk);
 	return 0;
 }
@@ -2282,27 +2315,30 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct ieee80211_work *wk;
-	const u8 *bssid = req->bss->bssid;
+	u8 bssid[ETH_ALEN];
+	bool assoc_bss = false;
 
 	mutex_lock(&ifmgd->mtx);
 
+	memcpy(bssid, req->bss->bssid, ETH_ALEN);
 	if (ifmgd->associated == req->bss) {
-		bssid = req->bss->bssid;
-		ieee80211_set_disassoc(sdata, true);
+		ieee80211_set_disassoc(sdata, false, true);
 		mutex_unlock(&ifmgd->mtx);
+		assoc_bss = true;
 	} else {
 		bool not_auth_yet = false;
 
 		mutex_unlock(&ifmgd->mtx);
 
-		mutex_lock(&local->work_mtx);
+		mutex_lock(&local->mtx);
 		list_for_each_entry(wk, &local->work_list, list) {
 			if (wk->sdata != sdata)
 				continue;
 
 			if (wk->type != IEEE80211_WORK_DIRECT_PROBE &&
 			    wk->type != IEEE80211_WORK_AUTH &&
-			    wk->type != IEEE80211_WORK_ASSOC)
+			    wk->type != IEEE80211_WORK_ASSOC &&
+			    wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT)
 				continue;
 
 			if (memcmp(req->bss->bssid, wk->filter_ta, ETH_ALEN))
@@ -2313,7 +2349,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 			free_work(wk);
 			break;
 		}
-		mutex_unlock(&local->work_mtx);
+		mutex_unlock(&local->mtx);
 
 		/*
 		 * If somebody requests authentication and we haven't
@@ -2335,8 +2371,12 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 	ieee80211_send_deauth_disassoc(sdata, bssid, IEEE80211_STYPE_DEAUTH,
 				       req->reason_code, cookie,
 				       !req->local_state_change);
+	if (assoc_bss)
+		sta_info_destroy_addr(sdata, bssid);
 
+	mutex_lock(&sdata->local->mtx);
 	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
 
 	return 0;
 }
@@ -2365,7 +2405,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
 	       sdata->name, req->bss->bssid, req->reason_code);
 
 	memcpy(bssid, req->bss->bssid, ETH_ALEN);
-	ieee80211_set_disassoc(sdata, false);
+	ieee80211_set_disassoc(sdata, false, true);
 
 	mutex_unlock(&ifmgd->mtx);
 
@@ -2374,46 +2414,13 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata,
 			cookie, !req->local_state_change);
 	sta_info_destroy_addr(sdata, bssid);
 
+	mutex_lock(&sdata->local->mtx);
 	ieee80211_recalc_idle(sdata->local);
+	mutex_unlock(&sdata->local->mtx);
 
 	return 0;
 }
 
-int ieee80211_mgd_action(struct ieee80211_sub_if_data *sdata,
-			 struct ieee80211_channel *chan,
-			 enum nl80211_channel_type channel_type,
-			 const u8 *buf, size_t len, u64 *cookie)
-{
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-	struct sk_buff *skb;
-
-	/* Check that we are on the requested channel for transmission */
-	if ((chan != local->tmp_channel ||
-	     channel_type != local->tmp_channel_type) &&
-	    (chan != local->oper_channel ||
-	     channel_type != local->_oper_channel_type))
-		return -EBUSY;
-
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + len);
-	if (!skb)
-		return -ENOMEM;
-	skb_reserve(skb, local->hw.extra_tx_headroom);
-
-	memcpy(skb_put(skb, len), buf, len);
-
-	if (!(ifmgd->flags & IEEE80211_STA_MFP_ENABLED))
-		IEEE80211_SKB_CB(skb)->flags |=
-			IEEE80211_TX_INTFL_DONT_ENCRYPT;
-	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX |
-		IEEE80211_TX_CTL_REQ_TX_STATUS;
-	skb->dev = sdata->dev;
-	ieee80211_tx_skb(sdata, skb);
-
-	*cookie = (unsigned long) skb;
-	return 0;
-}
-
 void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif,
 			       enum nl80211_cqm_rssi_threshold_event rssi_event,
 			       gfp_t gfp)
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index c36b1911987..4b564091e51 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -22,12 +22,16 @@
 static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
 {
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
 	local->offchannel_ps_enabled = false;
 
 	/* FIXME: what to do when local->pspolling is true? */
 
 	del_timer_sync(&local->dynamic_ps_timer);
+	del_timer_sync(&ifmgd->bcn_mon_timer);
+	del_timer_sync(&ifmgd->conn_mon_timer);
+
 	cancel_work_sync(&local->dynamic_ps_enable_work);
 
 	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
@@ -85,6 +89,9 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata)
 		mod_timer(&local->dynamic_ps_timer, jiffies +
 			  msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
 	}
+
+	ieee80211_sta_reset_beacon_monitor(sdata);
+	ieee80211_sta_reset_conn_monitor(sdata);
 }
 
 void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local)
@@ -112,8 +119,10 @@ void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local)
 		 * used from user space controlled off-channel operations.
 		 */
 		if (sdata->vif.type != NL80211_IFTYPE_STATION &&
-		    sdata->vif.type != NL80211_IFTYPE_MONITOR)
+		    sdata->vif.type != NL80211_IFTYPE_MONITOR) {
+			set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
 			netif_tx_stop_all_queues(sdata->dev);
+		}
 	}
 	mutex_unlock(&local->iflist_mtx);
 }
@@ -131,6 +140,7 @@ void ieee80211_offchannel_stop_station(struct ieee80211_local *local)
 			continue;
 
 		if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+			set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
 			netif_tx_stop_all_queues(sdata->dev);
 			if (sdata->u.mgd.associated)
 				ieee80211_offchannel_ps_enable(sdata);
@@ -155,8 +165,20 @@ void ieee80211_offchannel_return(struct ieee80211_local *local,
 				ieee80211_offchannel_ps_disable(sdata);
 		}
 
-		if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
+		if (sdata->vif.type != NL80211_IFTYPE_MONITOR) {
+			clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state);
+			/*
+			 * This may wake up queues even though the driver
+			 * currently has them stopped. This is not very
+			 * likely, since the driver won't have gotten any
+			 * (or hardly any) new packets while we weren't
+			 * on the right channel, and even if it happens
+			 * it will at most lead to queueing up one more
+			 * packet per queue in mac80211 rather than on
+			 * the interface qdisc.
+			 */
 			netif_tx_wake_all_queues(sdata->dev);
+		}
 
 		/* re-enable beaconing */
 		if (enable_beaconing &&
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 75202b295a4..e37355193ed 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -40,22 +40,14 @@ int __ieee80211_suspend(struct ieee80211_hw *hw)
 	list_for_each_entry(sdata, &local->interfaces, list)
 		ieee80211_disable_keys(sdata);
 
-	/* Tear down aggregation sessions */
-
-	rcu_read_lock();
-
-	if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
-		list_for_each_entry_rcu(sta, &local->sta_list, list) {
+	/* tear down aggregation sessions and remove STAs */
+	mutex_lock(&local->sta_mtx);
+	list_for_each_entry(sta, &local->sta_list, list) {
+		if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
 			set_sta_flags(sta, WLAN_STA_BLOCK_BA);
-			ieee80211_sta_tear_down_BA_sessions(sta);
+			ieee80211_sta_tear_down_BA_sessions(sta, true);
 		}
-	}
 
-	rcu_read_unlock();
-
-	/* remove STAs */
-	mutex_lock(&local->sta_mtx);
-	list_for_each_entry(sta, &local->sta_list, list) {
 		if (sta->uploaded) {
 			sdata = sta->sdata;
 			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
@@ -72,6 +64,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw)
 
 	/* remove all interfaces */
 	list_for_each_entry(sdata, &local->interfaces, list) {
+		cancel_work_sync(&sdata->work);
+
 		switch(sdata->vif.type) {
 		case NL80211_IFTYPE_STATION:
 			ieee80211_sta_quiesce(sdata);
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 6d0bd198af1..33f76993da0 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -103,6 +103,7 @@ ieee80211_rate_control_ops_get(const char *name)
 	struct rate_control_ops *ops;
 	const char *alg_name;
 
+	kparam_block_sysfs_write(ieee80211_default_rc_algo);
 	if (!name)
 		alg_name = ieee80211_default_rc_algo;
 	else
@@ -120,6 +121,7 @@ ieee80211_rate_control_ops_get(const char *name)
 	/* try built-in one if specific alg requested but not found */
 	if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT))
 		ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT);
+	kparam_unblock_sysfs_write(ieee80211_default_rc_algo);
 
 	return ops;
 }
@@ -143,6 +145,7 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf,
 static const struct file_operations rcname_ops = {
 	.read = rcname_read,
 	.open = mac80211_open_file_generic,
+	.llseek = default_llseek,
 };
 #endif
 
@@ -205,7 +208,7 @@ static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc)
 
 	fc = hdr->frame_control;
 
-	return ((info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc));
+	return (info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc);
 }
 
 static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, u8 max_rate_idx)
@@ -326,6 +329,9 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
 		 * if needed.
 		 */
 		for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+			/* Skip invalid rates */
+			if (info->control.rates[i].idx < 0)
+				break;
 			/* Rate masking supports only legacy rates for now */
 			if (info->control.rates[i].flags & IEEE80211_TX_RC_MCS)
 				continue;
@@ -366,8 +372,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
 
 	ref = rate_control_alloc(name, local);
 	if (!ref) {
-		printk(KERN_WARNING "%s: Failed to select rate control "
-		       "algorithm\n", wiphy_name(local->hw.wiphy));
+		wiphy_warn(local->hw.wiphy,
+			   "Failed to select rate control algorithm\n");
 		return -ENOENT;
 	}
 
@@ -378,9 +384,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
 		sta_info_flush(local, NULL);
 	}
 
-	printk(KERN_DEBUG "%s: Selected rate control "
-	       "algorithm '%s'\n", wiphy_name(local->hw.wiphy),
-	       ref->ops->name);
+	wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n",
+		    ref->ops->name);
 
 	return 0;
 }
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 065a96190e3..168427b0ffd 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -147,5 +147,18 @@ static inline void rc80211_minstrel_exit(void)
 }
 #endif
 
+#ifdef CONFIG_MAC80211_RC_MINSTREL_HT
+extern int rc80211_minstrel_ht_init(void);
+extern void rc80211_minstrel_ht_exit(void);
+#else
+static inline int rc80211_minstrel_ht_init(void)
+{
+	return 0;
+}
+static inline void rc80211_minstrel_ht_exit(void)
+{
+}
+#endif
+
 
 #endif /* IEEE80211_RATE_H */
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index f65ce6dcc8e..778c604d793 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -67,7 +67,6 @@ rix_to_ndx(struct minstrel_sta_info *mi, int rix)
 	for (i = rix; i >= 0; i--)
 		if (mi->r[i].rix == rix)
 			break;
-	WARN_ON(i < 0);
 	return i;
 }
 
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 241e76f3fdf..a290ad231d7 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -122,6 +122,7 @@ static const struct file_operations minstrel_stat_fops = {
 	.open = minstrel_stats_open,
 	.read = minstrel_stats_read,
 	.release = minstrel_stats_release,
+	.llseek = default_llseek,
 };
 
 void
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
new file mode 100644
index 00000000000..2a18d6602d4
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -0,0 +1,828 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/random.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rate.h"
+#include "rc80211_minstrel.h"
+#include "rc80211_minstrel_ht.h"
+
+#define AVG_PKT_SIZE	1200
+#define SAMPLE_COLUMNS	10
+#define EWMA_LEVEL		75
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS (AVG_PKT_SIZE << 3)
+
+/* Number of symbols for a packet with (bps) bits per symbol */
+#define MCS_NSYMS(bps) ((MCS_NBITS + (bps) - 1) / (bps))
+
+/* Transmission time for a packet containing (syms) symbols */
+#define MCS_SYMBOL_TIME(sgi, syms)					\
+	(sgi ?								\
+	  ((syms) * 18 + 4) / 5 :	/* syms * 3.6 us */		\
+	  (syms) << 2			/* syms * 4 us */		\
+	)
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps)))
+
+/* MCS rate information for an MCS group */
+#define MCS_GROUP(_streams, _sgi, _ht40) {				\
+	.streams = _streams,						\
+	.flags =							\
+		(_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) |			\
+		(_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0),		\
+	.duration = {							\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26),		\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52),		\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78),		\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104),	\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156),	\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208),	\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234),	\
+		MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260)		\
+	}								\
+}
+
+/*
+ * To enable sufficiently targeted rate sampling, MCS rates are divided into
+ * groups, based on the number of streams and flags (HT40, SGI) that they
+ * use.
+ */
+const struct mcs_group minstrel_mcs_groups[] = {
+	MCS_GROUP(1, 0, 0),
+	MCS_GROUP(2, 0, 0),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 0, 0),
+#endif
+
+	MCS_GROUP(1, 1, 0),
+	MCS_GROUP(2, 1, 0),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 1, 0),
+#endif
+
+	MCS_GROUP(1, 0, 1),
+	MCS_GROUP(2, 0, 1),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 0, 1),
+#endif
+
+	MCS_GROUP(1, 1, 1),
+	MCS_GROUP(2, 1, 1),
+#if MINSTREL_MAX_STREAMS >= 3
+	MCS_GROUP(3, 1, 1),
+#endif
+};
+
+static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES];
+
+/*
+ * Perform EWMA (Exponentially Weighted Moving Average) calculation
+ */
+static int
+minstrel_ewma(int old, int new, int weight)
+{
+	return (new * (100 - weight) + old * weight) / 100;
+}
+
+/*
+ * Look up an MCS group index based on mac80211 rate information
+ */
+static int
+minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate)
+{
+	int streams = (rate->idx / MCS_GROUP_RATES) + 1;
+	u32 flags = IEEE80211_TX_RC_SHORT_GI | IEEE80211_TX_RC_40_MHZ_WIDTH;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) {
+		if (minstrel_mcs_groups[i].streams != streams)
+			continue;
+		if (minstrel_mcs_groups[i].flags != (rate->flags & flags))
+			continue;
+
+		return i;
+	}
+
+	WARN_ON(1);
+	return 0;
+}
+
+static inline struct minstrel_rate_stats *
+minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index)
+{
+	return &mi->groups[index / MCS_GROUP_RATES].rates[index % MCS_GROUP_RATES];
+}
+
+
+/*
+ * Recalculate success probabilities and counters for a rate using EWMA
+ */
+static void
+minstrel_calc_rate_ewma(struct minstrel_priv *mp, struct minstrel_rate_stats *mr)
+{
+	if (unlikely(mr->attempts > 0)) {
+		mr->sample_skipped = 0;
+		mr->cur_prob = MINSTREL_FRAC(mr->success, mr->attempts);
+		if (!mr->att_hist)
+			mr->probability = mr->cur_prob;
+		else
+			mr->probability = minstrel_ewma(mr->probability,
+				mr->cur_prob, EWMA_LEVEL);
+		mr->att_hist += mr->attempts;
+		mr->succ_hist += mr->success;
+	} else {
+		mr->sample_skipped++;
+	}
+	mr->last_success = mr->success;
+	mr->last_attempts = mr->attempts;
+	mr->success = 0;
+	mr->attempts = 0;
+}
+
+/*
+ * Calculate throughput based on the average A-MPDU length, taking into account
+ * the expected number of retransmissions and their expected length
+ */
+static void
+minstrel_ht_calc_tp(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+                    int group, int rate)
+{
+	struct minstrel_rate_stats *mr;
+	unsigned int usecs;
+
+	mr = &mi->groups[group].rates[rate];
+
+	if (mr->probability < MINSTREL_FRAC(1, 10)) {
+		mr->cur_tp = 0;
+		return;
+	}
+
+	usecs = mi->overhead / MINSTREL_TRUNC(mi->avg_ampdu_len);
+	usecs += minstrel_mcs_groups[group].duration[rate];
+	mr->cur_tp = MINSTREL_TRUNC((1000000 / usecs) * mr->probability);
+}
+
+/*
+ * Update rate statistics and select new primary rates
+ *
+ * Rules for rate selection:
+ *  - max_prob_rate must use only one stream, as a tradeoff between delivery
+ *    probability and throughput during strong fluctuations
+ *  - as long as the max prob rate has a probability of more than 3/4, pick
+ *    higher throughput rates, even if the probablity is a bit lower
+ */
+static void
+minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+	struct minstrel_mcs_group_data *mg;
+	struct minstrel_rate_stats *mr;
+	int cur_prob, cur_prob_tp, cur_tp, cur_tp2;
+	int group, i, index;
+
+	if (mi->ampdu_packets > 0) {
+		mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len,
+			MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL);
+		mi->ampdu_len = 0;
+		mi->ampdu_packets = 0;
+	}
+
+	mi->sample_slow = 0;
+	mi->sample_count = 0;
+	mi->max_tp_rate = 0;
+	mi->max_tp_rate2 = 0;
+	mi->max_prob_rate = 0;
+
+	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
+		cur_prob = 0;
+		cur_prob_tp = 0;
+		cur_tp = 0;
+		cur_tp2 = 0;
+
+		mg = &mi->groups[group];
+		if (!mg->supported)
+			continue;
+
+		mg->max_tp_rate = 0;
+		mg->max_tp_rate2 = 0;
+		mg->max_prob_rate = 0;
+		mi->sample_count++;
+
+		for (i = 0; i < MCS_GROUP_RATES; i++) {
+			if (!(mg->supported & BIT(i)))
+				continue;
+
+			mr = &mg->rates[i];
+			mr->retry_updated = false;
+			index = MCS_GROUP_RATES * group + i;
+			minstrel_calc_rate_ewma(mp, mr);
+			minstrel_ht_calc_tp(mp, mi, group, i);
+
+			if (!mr->cur_tp)
+				continue;
+
+			/* ignore the lowest rate of each single-stream group */
+			if (!i && minstrel_mcs_groups[group].streams == 1)
+				continue;
+
+			if ((mr->cur_tp > cur_prob_tp && mr->probability >
+			     MINSTREL_FRAC(3, 4)) || mr->probability > cur_prob) {
+				mg->max_prob_rate = index;
+				cur_prob = mr->probability;
+				cur_prob_tp = mr->cur_tp;
+			}
+
+			if (mr->cur_tp > cur_tp) {
+				swap(index, mg->max_tp_rate);
+				cur_tp = mr->cur_tp;
+				mr = minstrel_get_ratestats(mi, index);
+			}
+
+			if (index >= mg->max_tp_rate)
+				continue;
+
+			if (mr->cur_tp > cur_tp2) {
+				mg->max_tp_rate2 = index;
+				cur_tp2 = mr->cur_tp;
+			}
+		}
+	}
+
+	/* try to sample up to half of the availble rates during each interval */
+	mi->sample_count *= 4;
+
+	cur_prob = 0;
+	cur_prob_tp = 0;
+	cur_tp = 0;
+	cur_tp2 = 0;
+	for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) {
+		mg = &mi->groups[group];
+		if (!mg->supported)
+			continue;
+
+		mr = minstrel_get_ratestats(mi, mg->max_prob_rate);
+		if (cur_prob_tp < mr->cur_tp &&
+		    minstrel_mcs_groups[group].streams == 1) {
+			mi->max_prob_rate = mg->max_prob_rate;
+			cur_prob = mr->cur_prob;
+			cur_prob_tp = mr->cur_tp;
+		}
+
+		mr = minstrel_get_ratestats(mi, mg->max_tp_rate);
+		if (cur_tp < mr->cur_tp) {
+			mi->max_tp_rate = mg->max_tp_rate;
+			cur_tp = mr->cur_tp;
+		}
+
+		mr = minstrel_get_ratestats(mi, mg->max_tp_rate2);
+		if (cur_tp2 < mr->cur_tp) {
+			mi->max_tp_rate2 = mg->max_tp_rate2;
+			cur_tp2 = mr->cur_tp;
+		}
+	}
+
+	mi->stats_update = jiffies;
+}
+
+static bool
+minstrel_ht_txstat_valid(struct ieee80211_tx_rate *rate)
+{
+	if (!rate->count)
+		return false;
+
+	if (rate->idx < 0)
+		return false;
+
+	return !!(rate->flags & IEEE80211_TX_RC_MCS);
+}
+
+static void
+minstrel_next_sample_idx(struct minstrel_ht_sta *mi)
+{
+	struct minstrel_mcs_group_data *mg;
+
+	for (;;) {
+		mi->sample_group++;
+		mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups);
+		mg = &mi->groups[mi->sample_group];
+
+		if (!mg->supported)
+			continue;
+
+		if (++mg->index >= MCS_GROUP_RATES) {
+			mg->index = 0;
+			if (++mg->column >= ARRAY_SIZE(sample_table))
+				mg->column = 0;
+		}
+		break;
+	}
+}
+
+static void
+minstrel_downgrade_rate(struct minstrel_ht_sta *mi, unsigned int *idx,
+			bool primary)
+{
+	int group, orig_group;
+
+	orig_group = group = *idx / MCS_GROUP_RATES;
+	while (group > 0) {
+		group--;
+
+		if (!mi->groups[group].supported)
+			continue;
+
+		if (minstrel_mcs_groups[group].streams >
+		    minstrel_mcs_groups[orig_group].streams)
+			continue;
+
+		if (primary)
+			*idx = mi->groups[group].max_tp_rate;
+		else
+			*idx = mi->groups[group].max_tp_rate2;
+		break;
+	}
+}
+
+static void
+minstrel_aggr_check(struct minstrel_priv *mp, struct ieee80211_sta *pubsta, struct sk_buff *skb)
+{
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	u16 tid;
+
+	if (unlikely(!ieee80211_is_data_qos(hdr->frame_control)))
+		return;
+
+	if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE)))
+		return;
+
+	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	if (likely(sta->ampdu_mlme.tid_tx[tid]))
+		return;
+
+	ieee80211_start_tx_ba_session(pubsta, tid);
+}
+
+static void
+minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband,
+                      struct ieee80211_sta *sta, void *priv_sta,
+                      struct sk_buff *skb)
+{
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_tx_rate *ar = info->status.rates;
+	struct minstrel_rate_stats *rate, *rate2;
+	struct minstrel_priv *mp = priv;
+	bool last = false;
+	int group;
+	int i = 0;
+
+	if (!msp->is_ht)
+		return mac80211_minstrel.tx_status(priv, sband, sta, &msp->legacy, skb);
+
+	/* This packet was aggregated but doesn't carry status info */
+	if ((info->flags & IEEE80211_TX_CTL_AMPDU) &&
+	    !(info->flags & IEEE80211_TX_STAT_AMPDU))
+		return;
+
+	if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) {
+		info->status.ampdu_ack_len =
+			(info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0);
+		info->status.ampdu_len = 1;
+	}
+
+	mi->ampdu_packets++;
+	mi->ampdu_len += info->status.ampdu_len;
+
+	if (!mi->sample_wait && !mi->sample_tries && mi->sample_count > 0) {
+		mi->sample_wait = 4 + 2 * MINSTREL_TRUNC(mi->avg_ampdu_len);
+		mi->sample_tries = 3;
+		mi->sample_count--;
+	}
+
+	if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) {
+		mi->sample_packets += info->status.ampdu_len;
+		minstrel_next_sample_idx(mi);
+	}
+
+	for (i = 0; !last; i++) {
+		last = (i == IEEE80211_TX_MAX_RATES - 1) ||
+		       !minstrel_ht_txstat_valid(&ar[i + 1]);
+
+		if (!minstrel_ht_txstat_valid(&ar[i]))
+			break;
+
+		group = minstrel_ht_get_group_idx(&ar[i]);
+		rate = &mi->groups[group].rates[ar[i].idx % 8];
+
+		if (last)
+			rate->success += info->status.ampdu_ack_len;
+
+		rate->attempts += ar[i].count * info->status.ampdu_len;
+	}
+
+	/*
+	 * check for sudden death of spatial multiplexing,
+	 * downgrade to a lower number of streams if necessary.
+	 */
+	rate = minstrel_get_ratestats(mi, mi->max_tp_rate);
+	if (rate->attempts > 30 &&
+	    MINSTREL_FRAC(rate->success, rate->attempts) <
+	    MINSTREL_FRAC(20, 100))
+		minstrel_downgrade_rate(mi, &mi->max_tp_rate, true);
+
+	rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate2);
+	if (rate2->attempts > 30 &&
+	    MINSTREL_FRAC(rate2->success, rate2->attempts) <
+	    MINSTREL_FRAC(20, 100))
+		minstrel_downgrade_rate(mi, &mi->max_tp_rate2, false);
+
+	if (time_after(jiffies, mi->stats_update + (mp->update_interval / 2 * HZ) / 1000)) {
+		minstrel_ht_update_stats(mp, mi);
+		minstrel_aggr_check(mp, sta, skb);
+	}
+}
+
+static void
+minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+                         int index)
+{
+	struct minstrel_rate_stats *mr;
+	const struct mcs_group *group;
+	unsigned int tx_time, tx_time_rtscts, tx_time_data;
+	unsigned int cw = mp->cw_min;
+	unsigned int t_slot = 9; /* FIXME */
+	unsigned int ampdu_len = MINSTREL_TRUNC(mi->avg_ampdu_len);
+
+	mr = minstrel_get_ratestats(mi, index);
+	if (mr->probability < MINSTREL_FRAC(1, 10)) {
+		mr->retry_count = 1;
+		mr->retry_count_rtscts = 1;
+		return;
+	}
+
+	mr->retry_count = 2;
+	mr->retry_count_rtscts = 2;
+	mr->retry_updated = true;
+
+	group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+	tx_time_data = group->duration[index % MCS_GROUP_RATES] * ampdu_len;
+	tx_time = 2 * (t_slot + mi->overhead + tx_time_data);
+	tx_time_rtscts = 2 * (t_slot + mi->overhead_rtscts + tx_time_data);
+	do {
+		cw = (cw << 1) | 1;
+		cw = min(cw, mp->cw_max);
+		tx_time += cw + t_slot + mi->overhead;
+		tx_time_rtscts += cw + t_slot + mi->overhead_rtscts;
+		if (tx_time_rtscts < mp->segment_size)
+			mr->retry_count_rtscts++;
+	} while ((tx_time < mp->segment_size) &&
+	         (++mr->retry_count < mp->max_retry));
+}
+
+
+static void
+minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
+                     struct ieee80211_tx_rate *rate, int index,
+                     struct ieee80211_tx_rate_control *txrc,
+                     bool sample, bool rtscts)
+{
+	const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+	struct minstrel_rate_stats *mr;
+
+	mr = minstrel_get_ratestats(mi, index);
+	if (!mr->retry_updated)
+		minstrel_calc_retransmit(mp, mi, index);
+
+	if (mr->probability < MINSTREL_FRAC(20, 100))
+		rate->count = 2;
+	else if (rtscts)
+		rate->count = mr->retry_count_rtscts;
+	else
+		rate->count = mr->retry_count;
+
+	rate->flags = IEEE80211_TX_RC_MCS | group->flags;
+	if (txrc->short_preamble)
+		rate->flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE;
+	if (txrc->rts || rtscts)
+		rate->flags |= IEEE80211_TX_RC_USE_RTS_CTS;
+	rate->idx = index % MCS_GROUP_RATES + (group->streams - 1) * MCS_GROUP_RATES;
+}
+
+static inline int
+minstrel_get_duration(int index)
+{
+	const struct mcs_group *group = &minstrel_mcs_groups[index / MCS_GROUP_RATES];
+	return group->duration[index % MCS_GROUP_RATES];
+}
+
+static int
+minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi)
+{
+	struct minstrel_rate_stats *mr;
+	struct minstrel_mcs_group_data *mg;
+	int sample_idx = 0;
+
+	if (mi->sample_wait > 0) {
+		mi->sample_wait--;
+		return -1;
+	}
+
+	if (!mi->sample_tries)
+		return -1;
+
+	mi->sample_tries--;
+	mg = &mi->groups[mi->sample_group];
+	sample_idx = sample_table[mg->column][mg->index];
+	mr = &mg->rates[sample_idx];
+	sample_idx += mi->sample_group * MCS_GROUP_RATES;
+
+	/*
+	 * When not using MRR, do not sample if the probability is already
+	 * higher than 95% to avoid wasting airtime
+	 */
+	if (!mp->has_mrr && (mr->probability > MINSTREL_FRAC(95, 100)))
+		goto next;
+
+	/*
+	 * Make sure that lower rates get sampled only occasionally,
+	 * if the link is working perfectly.
+	 */
+	if (minstrel_get_duration(sample_idx) >
+	    minstrel_get_duration(mi->max_tp_rate)) {
+		if (mr->sample_skipped < 10)
+			goto next;
+
+		if (mi->sample_slow++ > 2)
+			goto next;
+	}
+
+	return sample_idx;
+
+next:
+	minstrel_next_sample_idx(mi);
+	return -1;
+}
+
+static void
+minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta,
+                     struct ieee80211_tx_rate_control *txrc)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb);
+	struct ieee80211_tx_rate *ar = info->status.rates;
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct minstrel_priv *mp = priv;
+	int sample_idx;
+
+	if (rate_control_send_low(sta, priv_sta, txrc))
+		return;
+
+	if (!msp->is_ht)
+		return mac80211_minstrel.get_rate(priv, sta, &msp->legacy, txrc);
+
+	info->flags |= mi->tx_flags;
+	sample_idx = minstrel_get_sample_rate(mp, mi);
+	if (sample_idx >= 0) {
+		minstrel_ht_set_rate(mp, mi, &ar[0], sample_idx,
+			txrc, true, false);
+		minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate,
+			txrc, false, true);
+		info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE;
+	} else {
+		minstrel_ht_set_rate(mp, mi, &ar[0], mi->max_tp_rate,
+			txrc, false, false);
+		minstrel_ht_set_rate(mp, mi, &ar[1], mi->max_tp_rate2,
+			txrc, false, true);
+	}
+	minstrel_ht_set_rate(mp, mi, &ar[2], mi->max_prob_rate, txrc, false, true);
+
+	ar[3].count = 0;
+	ar[3].idx = -1;
+
+	mi->total_packets++;
+
+	/* wraparound */
+	if (mi->total_packets == ~0) {
+		mi->total_packets = 0;
+		mi->sample_packets = 0;
+	}
+}
+
+static void
+minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband,
+                        struct ieee80211_sta *sta, void *priv_sta,
+			enum nl80211_channel_type oper_chan_type)
+{
+	struct minstrel_priv *mp = priv;
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs;
+	struct ieee80211_local *local = hw_to_local(mp->hw);
+	u16 sta_cap = sta->ht_cap.cap;
+	int ack_dur;
+	int stbc;
+	int i;
+
+	/* fall back to the old minstrel for legacy stations */
+	if (!sta->ht_cap.ht_supported) {
+		msp->is_ht = false;
+		memset(&msp->legacy, 0, sizeof(msp->legacy));
+		msp->legacy.r = msp->ratelist;
+		msp->legacy.sample_table = msp->sample_table;
+		return mac80211_minstrel.rate_init(priv, sband, sta, &msp->legacy);
+	}
+
+	BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) !=
+		MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS);
+
+	msp->is_ht = true;
+	memset(mi, 0, sizeof(*mi));
+	mi->stats_update = jiffies;
+
+	ack_dur = ieee80211_frame_duration(local, 10, 60, 1, 1);
+	mi->overhead = ieee80211_frame_duration(local, 0, 60, 1, 1) + ack_dur;
+	mi->overhead_rtscts = mi->overhead + 2 * ack_dur;
+
+	mi->avg_ampdu_len = MINSTREL_FRAC(1, 1);
+
+	/* When using MRR, sample more on the first attempt, without delay */
+	if (mp->has_mrr) {
+		mi->sample_count = 16;
+		mi->sample_wait = 0;
+	} else {
+		mi->sample_count = 8;
+		mi->sample_wait = 8;
+	}
+	mi->sample_tries = 4;
+
+	stbc = (sta_cap & IEEE80211_HT_CAP_RX_STBC) >>
+		IEEE80211_HT_CAP_RX_STBC_SHIFT;
+	mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT;
+
+	if (sta_cap & IEEE80211_HT_CAP_LDPC_CODING)
+		mi->tx_flags |= IEEE80211_TX_CTL_LDPC;
+
+	if (oper_chan_type != NL80211_CHAN_HT40MINUS &&
+	    oper_chan_type != NL80211_CHAN_HT40PLUS)
+		sta_cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+
+	for (i = 0; i < ARRAY_SIZE(mi->groups); i++) {
+		u16 req = 0;
+
+		mi->groups[i].supported = 0;
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI) {
+			if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+				req |= IEEE80211_HT_CAP_SGI_40;
+			else
+				req |= IEEE80211_HT_CAP_SGI_20;
+		}
+
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+			req |= IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+
+		if ((sta_cap & req) != req)
+			continue;
+
+		mi->groups[i].supported =
+			mcs->rx_mask[minstrel_mcs_groups[i].streams - 1];
+	}
+}
+
+static void
+minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband,
+                      struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_priv *mp = priv;
+
+	minstrel_ht_update_caps(priv, sband, sta, priv_sta, mp->hw->conf.channel_type);
+}
+
+static void
+minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband,
+                        struct ieee80211_sta *sta, void *priv_sta,
+                        u32 changed, enum nl80211_channel_type oper_chan_type)
+{
+	minstrel_ht_update_caps(priv, sband, sta, priv_sta, oper_chan_type);
+}
+
+static void *
+minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp)
+{
+	struct ieee80211_supported_band *sband;
+	struct minstrel_ht_sta_priv *msp;
+	struct minstrel_priv *mp = priv;
+	struct ieee80211_hw *hw = mp->hw;
+	int max_rates = 0;
+	int i;
+
+	for (i = 0; i < IEEE80211_NUM_BANDS; i++) {
+		sband = hw->wiphy->bands[i];
+		if (sband && sband->n_bitrates > max_rates)
+			max_rates = sband->n_bitrates;
+	}
+
+	msp = kzalloc(sizeof(struct minstrel_ht_sta), gfp);
+	if (!msp)
+		return NULL;
+
+	msp->ratelist = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp);
+	if (!msp->ratelist)
+		goto error;
+
+	msp->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp);
+	if (!msp->sample_table)
+		goto error1;
+
+	return msp;
+
+error1:
+	kfree(msp->ratelist);
+error:
+	kfree(msp);
+	return NULL;
+}
+
+static void
+minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta)
+{
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+
+	kfree(msp->sample_table);
+	kfree(msp->ratelist);
+	kfree(msp);
+}
+
+static void *
+minstrel_ht_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
+{
+	return mac80211_minstrel.alloc(hw, debugfsdir);
+}
+
+static void
+minstrel_ht_free(void *priv)
+{
+	mac80211_minstrel.free(priv);
+}
+
+static struct rate_control_ops mac80211_minstrel_ht = {
+	.name = "minstrel_ht",
+	.tx_status = minstrel_ht_tx_status,
+	.get_rate = minstrel_ht_get_rate,
+	.rate_init = minstrel_ht_rate_init,
+	.rate_update = minstrel_ht_rate_update,
+	.alloc_sta = minstrel_ht_alloc_sta,
+	.free_sta = minstrel_ht_free_sta,
+	.alloc = minstrel_ht_alloc,
+	.free = minstrel_ht_free,
+#ifdef CONFIG_MAC80211_DEBUGFS
+	.add_sta_debugfs = minstrel_ht_add_sta_debugfs,
+	.remove_sta_debugfs = minstrel_ht_remove_sta_debugfs,
+#endif
+};
+
+
+static void
+init_sample_table(void)
+{
+	int col, i, new_idx;
+	u8 rnd[MCS_GROUP_RATES];
+
+	memset(sample_table, 0xff, sizeof(sample_table));
+	for (col = 0; col < SAMPLE_COLUMNS; col++) {
+		for (i = 0; i < MCS_GROUP_RATES; i++) {
+			get_random_bytes(rnd, sizeof(rnd));
+			new_idx = (i + rnd[i]) % MCS_GROUP_RATES;
+
+			while (sample_table[col][new_idx] != 0xff)
+				new_idx = (new_idx + 1) % MCS_GROUP_RATES;
+
+			sample_table[col][new_idx] = i;
+		}
+	}
+}
+
+int __init
+rc80211_minstrel_ht_init(void)
+{
+	init_sample_table();
+	return ieee80211_rate_control_register(&mac80211_minstrel_ht);
+}
+
+void
+rc80211_minstrel_ht_exit(void)
+{
+	ieee80211_rate_control_unregister(&mac80211_minstrel_ht);
+}
diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h
new file mode 100644
index 00000000000..462d2b227ed
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __RC_MINSTREL_HT_H
+#define __RC_MINSTREL_HT_H
+
+/*
+ * The number of streams can be changed to 2 to reduce code
+ * size and memory footprint.
+ */
+#define MINSTREL_MAX_STREAMS	3
+#define MINSTREL_STREAM_GROUPS	4
+
+/* scaled fraction values */
+#define MINSTREL_SCALE	16
+#define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div)
+#define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE)
+
+#define MCS_GROUP_RATES	8
+
+struct mcs_group {
+	u32 flags;
+	unsigned int streams;
+	unsigned int duration[MCS_GROUP_RATES];
+};
+
+extern const struct mcs_group minstrel_mcs_groups[];
+
+struct minstrel_rate_stats {
+	/* current / last sampling period attempts/success counters */
+	unsigned int attempts, last_attempts;
+	unsigned int success, last_success;
+
+	/* total attempts/success counters */
+	u64 att_hist, succ_hist;
+
+	/* current throughput */
+	unsigned int cur_tp;
+
+	/* packet delivery probabilities */
+	unsigned int cur_prob, probability;
+
+	/* maximum retry counts */
+	unsigned int retry_count;
+	unsigned int retry_count_rtscts;
+
+	bool retry_updated;
+	u8 sample_skipped;
+};
+
+struct minstrel_mcs_group_data {
+	u8 index;
+	u8 column;
+
+	/* bitfield of supported MCS rates of this group */
+	u8 supported;
+
+	/* selected primary rates */
+	unsigned int max_tp_rate;
+	unsigned int max_tp_rate2;
+	unsigned int max_prob_rate;
+
+	/* MCS rate statistics */
+	struct minstrel_rate_stats rates[MCS_GROUP_RATES];
+};
+
+struct minstrel_ht_sta {
+	/* ampdu length (average, per sampling interval) */
+	unsigned int ampdu_len;
+	unsigned int ampdu_packets;
+
+	/* ampdu length (EWMA) */
+	unsigned int avg_ampdu_len;
+
+	/* best throughput rate */
+	unsigned int max_tp_rate;
+
+	/* second best throughput rate */
+	unsigned int max_tp_rate2;
+
+	/* best probability rate */
+	unsigned int max_prob_rate;
+
+	/* time of last status update */
+	unsigned long stats_update;
+
+	/* overhead time in usec for each frame */
+	unsigned int overhead;
+	unsigned int overhead_rtscts;
+
+	unsigned int total_packets;
+	unsigned int sample_packets;
+
+	/* tx flags to add for frames for this sta */
+	u32 tx_flags;
+
+	u8 sample_wait;
+	u8 sample_tries;
+	u8 sample_count;
+	u8 sample_slow;
+
+	/* current MCS group to be sampled */
+	u8 sample_group;
+
+	/* MCS rate group info and statistics */
+	struct minstrel_mcs_group_data groups[MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS];
+};
+
+struct minstrel_ht_sta_priv {
+	union {
+		struct minstrel_ht_sta ht;
+		struct minstrel_sta_info legacy;
+	};
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct dentry *dbg_stats;
+#endif
+	void *ratelist;
+	void *sample_table;
+	bool is_ht;
+};
+
+void minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir);
+void minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta);
+
+#endif
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
new file mode 100644
index 00000000000..cefcb5d2dae
--- /dev/null
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2010 Felix Fietkau <nbd@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/debugfs.h>
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include "rc80211_minstrel.h"
+#include "rc80211_minstrel_ht.h"
+
+static int
+minstrel_ht_stats_open(struct inode *inode, struct file *file)
+{
+	struct minstrel_ht_sta_priv *msp = inode->i_private;
+	struct minstrel_ht_sta *mi = &msp->ht;
+	struct minstrel_debugfs_info *ms;
+	unsigned int i, j, tp, prob, eprob;
+	char *p;
+	int ret;
+
+	if (!msp->is_ht) {
+		inode->i_private = &msp->legacy;
+		ret = minstrel_stats_open(inode, file);
+		inode->i_private = msp;
+		return ret;
+	}
+
+	ms = kmalloc(sizeof(*ms) + 8192, GFP_KERNEL);
+	if (!ms)
+		return -ENOMEM;
+
+	file->private_data = ms;
+	p = ms->buf;
+	p += sprintf(p, "type      rate     throughput  ewma prob   this prob  "
+			"this succ/attempt   success    attempts\n");
+	for (i = 0; i < MINSTREL_MAX_STREAMS * MINSTREL_STREAM_GROUPS; i++) {
+		char htmode = '2';
+		char gimode = 'L';
+
+		if (!mi->groups[i].supported)
+			continue;
+
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+			htmode = '4';
+		if (minstrel_mcs_groups[i].flags & IEEE80211_TX_RC_SHORT_GI)
+			gimode = 'S';
+
+		for (j = 0; j < MCS_GROUP_RATES; j++) {
+			struct minstrel_rate_stats *mr = &mi->groups[i].rates[j];
+			int idx = i * MCS_GROUP_RATES + j;
+
+			if (!(mi->groups[i].supported & BIT(j)))
+				continue;
+
+			p += sprintf(p, "HT%c0/%cGI ", htmode, gimode);
+
+			*(p++) = (idx == mi->max_tp_rate) ? 'T' : ' ';
+			*(p++) = (idx == mi->max_tp_rate2) ? 't' : ' ';
+			*(p++) = (idx == mi->max_prob_rate) ? 'P' : ' ';
+			p += sprintf(p, "MCS%-2u", (minstrel_mcs_groups[i].streams - 1) *
+					MCS_GROUP_RATES + j);
+
+			tp = mr->cur_tp / 10;
+			prob = MINSTREL_TRUNC(mr->cur_prob * 1000);
+			eprob = MINSTREL_TRUNC(mr->probability * 1000);
+
+			p += sprintf(p, "  %6u.%1u   %6u.%1u   %6u.%1u        "
+					"%3u(%3u)   %8llu    %8llu\n",
+					tp / 10, tp % 10,
+					eprob / 10, eprob % 10,
+					prob / 10, prob % 10,
+					mr->last_success,
+					mr->last_attempts,
+					(unsigned long long)mr->succ_hist,
+					(unsigned long long)mr->att_hist);
+		}
+	}
+	p += sprintf(p, "\nTotal packet count::    ideal %d      "
+			"lookaround %d\n",
+			max(0, (int) mi->total_packets - (int) mi->sample_packets),
+			mi->sample_packets);
+	p += sprintf(p, "Average A-MPDU length: %d.%d\n",
+		MINSTREL_TRUNC(mi->avg_ampdu_len),
+		MINSTREL_TRUNC(mi->avg_ampdu_len * 10) % 10);
+	ms->len = p - ms->buf;
+
+	return nonseekable_open(inode, file);
+}
+
+static const struct file_operations minstrel_ht_stat_fops = {
+	.owner = THIS_MODULE,
+	.open = minstrel_ht_stats_open,
+	.read = minstrel_stats_read,
+	.release = minstrel_stats_release,
+	.llseek = no_llseek,
+};
+
+void
+minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
+{
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+
+	msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp,
+			&minstrel_ht_stat_fops);
+}
+
+void
+minstrel_ht_remove_sta_debugfs(void *priv, void *priv_sta)
+{
+	struct minstrel_ht_sta_priv *msp = priv_sta;
+
+	debugfs_remove(msp->dbg_stats);
+}
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
index 47438b4a9af..4851e9e2dae 100644
--- a/net/mac80211/rc80211_pid_debugfs.c
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -162,7 +162,7 @@ static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf,
 	file_info->next_entry = (file_info->next_entry + 1) %
 				RC_PID_EVENT_RING_SIZE;
 
-	/* Print information about the event. Note that userpace needs to
+	/* Print information about the event. Note that userspace needs to
 	 * provide large enough buffers. */
 	length = length < RC_PID_PRINT_BUF_SIZE ?
 		 length : RC_PID_PRINT_BUF_SIZE;
@@ -206,6 +206,7 @@ static const struct file_operations rc_pid_fop_events = {
 	.poll = rate_control_pid_events_poll,
 	.open = rate_control_pid_events_open,
 	.release = rate_control_pid_events_release,
+	.llseek = noop_llseek,
 };
 
 void rate_control_pid_add_sta_debugfs(void *priv, void *priv_sta,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index be9abc2e634..902b03ee8f6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -293,7 +293,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 			skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (skb2) {
 				skb2->dev = prev_dev;
-				netif_rx(skb2);
+				netif_receive_skb(skb2);
 			}
 		}
 
@@ -304,7 +304,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 
 	if (prev_dev) {
 		skb->dev = prev_dev;
-		netif_rx(skb);
+		netif_receive_skb(skb);
 	} else
 		dev_kfree_skb(skb);
 
@@ -315,6 +315,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 	int tid;
 
 	/* does the frame have a qos control field? */
@@ -323,9 +324,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
 		/* frame has qos control */
 		tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 		if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT)
-			rx->flags |= IEEE80211_RX_AMSDU;
-		else
-			rx->flags &= ~IEEE80211_RX_AMSDU;
+			status->rx_flags |= IEEE80211_RX_AMSDU;
 	} else {
 		/*
 		 * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
@@ -387,26 +386,25 @@ static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_local *local = rx->local;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 	struct sk_buff *skb = rx->skb;
 
-	if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning)))
+	if (likely(!(status->rx_flags & IEEE80211_RX_IN_SCAN)))
+		return RX_CONTINUE;
+
+	if (test_bit(SCAN_HW_SCANNING, &local->scanning))
 		return ieee80211_scan_rx(rx->sdata, skb);
 
-	if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning) &&
-		     (rx->flags & IEEE80211_RX_IN_SCAN))) {
+	if (test_bit(SCAN_SW_SCANNING, &local->scanning)) {
 		/* drop all the other packets during a software scan anyway */
 		if (ieee80211_scan_rx(rx->sdata, skb) != RX_QUEUED)
 			dev_kfree_skb(skb);
 		return RX_QUEUED;
 	}
 
-	if (unlikely(rx->flags & IEEE80211_RX_IN_SCAN)) {
-		/* scanning finished during invoking of handlers */
-		I802_DEBUG_INC(local->rx_handlers_drop_passive_scan);
-		return RX_DROP_UNUSABLE;
-	}
-
-	return RX_CONTINUE;
+	/* scanning finished during invoking of handlers */
+	I802_DEBUG_INC(local->rx_handlers_drop_passive_scan);
+	return RX_DROP_UNUSABLE;
 }
 
 
@@ -538,20 +536,12 @@ static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw,
 					    int index,
 					    struct sk_buff_head *frames)
 {
-	struct ieee80211_supported_band *sband;
-	struct ieee80211_rate *rate = NULL;
 	struct sk_buff *skb = tid_agg_rx->reorder_buf[index];
-	struct ieee80211_rx_status *status;
 
 	if (!skb)
 		goto no_frame;
 
-	status = IEEE80211_SKB_RXCB(skb);
-
-	/* release the reordered frames to stack */
-	sband = hw->wiphy->bands[status->band];
-	if (!(status->flag & RX_FLAG_HT))
-		rate = &sband->bitrates[status->rate_idx];
+	/* release the frame from the reorder ring buffer */
 	tid_agg_rx->stored_mpdu_num--;
 	tid_agg_rx->reorder_buf[index] = NULL;
 	__skb_queue_tail(frames, skb);
@@ -580,9 +570,102 @@ static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw,
  * frames that have not yet been received are assumed to be lost and the skb
  * can be released for processing. This may also release other skb's from the
  * reorder buffer if there are no additional gaps between the frames.
+ *
+ * Callers must hold tid_agg_rx->reorder_lock.
  */
 #define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)
 
+static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw,
+					  struct tid_ampdu_rx *tid_agg_rx,
+					  struct sk_buff_head *frames)
+{
+	int index, j;
+
+	/* release the buffer until next missing frame */
+	index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
+						tid_agg_rx->buf_size;
+	if (!tid_agg_rx->reorder_buf[index] &&
+	    tid_agg_rx->stored_mpdu_num > 1) {
+		/*
+		 * No buffers ready to be released, but check whether any
+		 * frames in the reorder buffer have timed out.
+		 */
+		int skipped = 1;
+		for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
+		     j = (j + 1) % tid_agg_rx->buf_size) {
+			if (!tid_agg_rx->reorder_buf[j]) {
+				skipped++;
+				continue;
+			}
+			if (!time_after(jiffies, tid_agg_rx->reorder_time[j] +
+					HT_RX_REORDER_BUF_TIMEOUT))
+				goto set_release_timer;
+
+#ifdef CONFIG_MAC80211_HT_DEBUG
+			if (net_ratelimit())
+				wiphy_debug(hw->wiphy,
+					    "release an RX reorder frame due to timeout on earlier frames\n");
+#endif
+			ieee80211_release_reorder_frame(hw, tid_agg_rx,
+							j, frames);
+
+			/*
+			 * Increment the head seq# also for the skipped slots.
+			 */
+			tid_agg_rx->head_seq_num =
+				(tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
+			skipped = 0;
+		}
+	} else while (tid_agg_rx->reorder_buf[index]) {
+		ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames);
+		index =	seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
+							tid_agg_rx->buf_size;
+	}
+
+	/*
+	 * Disable the reorder release timer for now.
+	 *
+	 * The current implementation lacks a proper locking scheme
+	 * which would protect vital statistic and debug counters
+	 * from being updated by two different but concurrent BHs.
+	 *
+	 * More information about the topic is available from:
+	 * - thread: http://marc.info/?t=128635927000001
+	 *
+	 * What was wrong:
+	 * =>  http://marc.info/?l=linux-wireless&m=128636170811964
+	 * "Basically the thing is that until your patch, the data
+	 *  in the struct didn't actually need locking because it
+	 *  was accessed by the RX path only which is not concurrent."
+	 *
+	 * List of what needs to be fixed:
+	 * => http://marc.info/?l=linux-wireless&m=128656352920957
+	 *
+
+	if (tid_agg_rx->stored_mpdu_num) {
+		j = index = seq_sub(tid_agg_rx->head_seq_num,
+				    tid_agg_rx->ssn) % tid_agg_rx->buf_size;
+
+		for (; j != (index - 1) % tid_agg_rx->buf_size;
+		     j = (j + 1) % tid_agg_rx->buf_size) {
+			if (tid_agg_rx->reorder_buf[j])
+				break;
+		}
+
+ set_release_timer:
+
+		mod_timer(&tid_agg_rx->reorder_timer,
+			  tid_agg_rx->reorder_time[j] +
+			  HT_RX_REORDER_BUF_TIMEOUT);
+	} else {
+		del_timer(&tid_agg_rx->reorder_timer);
+	}
+	*/
+
+set_release_timer:
+	return;
+}
+
 /*
  * As this function belongs to the RX path it must be under
  * rcu_read_lock protection. It returns false if the frame
@@ -598,14 +681,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
 	u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
 	u16 head_seq_num, buf_size;
 	int index;
+	bool ret = true;
 
 	buf_size = tid_agg_rx->buf_size;
 	head_seq_num = tid_agg_rx->head_seq_num;
 
+	spin_lock(&tid_agg_rx->reorder_lock);
 	/* frame with out of date sequence number */
 	if (seq_less(mpdu_seq_num, head_seq_num)) {
 		dev_kfree_skb(skb);
-		return true;
+		goto out;
 	}
 
 	/*
@@ -626,7 +711,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
 	/* check if we already stored this frame */
 	if (tid_agg_rx->reorder_buf[index]) {
 		dev_kfree_skb(skb);
-		return true;
+		goto out;
 	}
 
 	/*
@@ -636,58 +721,19 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw,
 	if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
 	    tid_agg_rx->stored_mpdu_num == 0) {
 		tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num);
-		return false;
+		ret = false;
+		goto out;
 	}
 
 	/* put the frame in the reordering buffer */
 	tid_agg_rx->reorder_buf[index] = skb;
 	tid_agg_rx->reorder_time[index] = jiffies;
 	tid_agg_rx->stored_mpdu_num++;
-	/* release the buffer until next missing frame */
-	index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
-						tid_agg_rx->buf_size;
-	if (!tid_agg_rx->reorder_buf[index] &&
-	    tid_agg_rx->stored_mpdu_num > 1) {
-		/*
-		 * No buffers ready to be released, but check whether any
-		 * frames in the reorder buffer have timed out.
-		 */
-		int j;
-		int skipped = 1;
-		for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
-		     j = (j + 1) % tid_agg_rx->buf_size) {
-			if (!tid_agg_rx->reorder_buf[j]) {
-				skipped++;
-				continue;
-			}
-			if (!time_after(jiffies, tid_agg_rx->reorder_time[j] +
-					HT_RX_REORDER_BUF_TIMEOUT))
-				break;
-
-#ifdef CONFIG_MAC80211_HT_DEBUG
-			if (net_ratelimit())
-				printk(KERN_DEBUG "%s: release an RX reorder "
-				       "frame due to timeout on earlier "
-				       "frames\n",
-				       wiphy_name(hw->wiphy));
-#endif
-			ieee80211_release_reorder_frame(hw, tid_agg_rx,
-							j, frames);
-
-			/*
-			 * Increment the head seq# also for the skipped slots.
-			 */
-			tid_agg_rx->head_seq_num =
-				(tid_agg_rx->head_seq_num + skipped) & SEQ_MASK;
-			skipped = 0;
-		}
-	} else while (tid_agg_rx->reorder_buf[index]) {
-		ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames);
-		index =	seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) %
-							tid_agg_rx->buf_size;
-	}
+	ieee80211_sta_reorder_release(hw, tid_agg_rx, frames);
 
-	return true;
+ out:
+	spin_unlock(&tid_agg_rx->reorder_lock);
+	return ret;
 }
 
 /*
@@ -719,16 +765,13 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
 
 	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
 
-	spin_lock(&sta->lock);
-
-	if (!sta->ampdu_mlme.tid_active_rx[tid])
-		goto dont_reorder_unlock;
-
-	tid_agg_rx = sta->ampdu_mlme.tid_rx[tid];
+	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+	if (!tid_agg_rx)
+		goto dont_reorder;
 
 	/* qos null data frames are excluded */
 	if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
-		goto dont_reorder_unlock;
+		goto dont_reorder;
 
 	/* new, potentially un-ordered, ampdu frame - process it */
 
@@ -740,20 +783,22 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
 	/* if this mpdu is fragmented - terminate rx aggregation session */
 	sc = le16_to_cpu(hdr->seq_ctrl);
 	if (sc & IEEE80211_SCTL_FRAG) {
-		spin_unlock(&sta->lock);
-		__ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
-					       WLAN_REASON_QSTA_REQUIRE_SETUP);
-		dev_kfree_skb(skb);
+		skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+		skb_queue_tail(&rx->sdata->skb_queue, skb);
+		ieee80211_queue_work(&local->hw, &rx->sdata->work);
 		return;
 	}
 
-	if (ieee80211_sta_manage_reorder_buf(hw, tid_agg_rx, skb, frames)) {
-		spin_unlock(&sta->lock);
+	/*
+	 * No locking needed -- we will only ever process one
+	 * RX packet at a time, and thus own tid_agg_rx. All
+	 * other code manipulating it needs to (and does) make
+	 * sure that we cannot get to it any more before doing
+	 * anything with it.
+	 */
+	if (ieee80211_sta_manage_reorder_buf(hw, tid_agg_rx, skb, frames))
 		return;
-	}
 
- dont_reorder_unlock:
-	spin_unlock(&sta->lock);
  dont_reorder:
 	__skb_queue_tail(frames, skb);
 }
@@ -762,13 +807,14 @@ static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 
 	/* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */
 	if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) {
 		if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
 			     rx->sta->last_seq_ctrl[rx->queue] ==
 			     hdr->seq_ctrl)) {
-			if (rx->flags & IEEE80211_RX_RA_MATCH) {
+			if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
 				rx->local->dot11FrameDuplicateCount++;
 				rx->sta->num_duplicates++;
 			}
@@ -797,11 +843,12 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
 	if (unlikely((ieee80211_is_data(hdr->frame_control) ||
 		      ieee80211_is_pspoll(hdr->frame_control)) &&
 		     rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+		     rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
 		     (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) {
 		if ((!ieee80211_has_fromds(hdr->frame_control) &&
 		     !ieee80211_has_tods(hdr->frame_control) &&
 		     ieee80211_is_data(hdr->frame_control)) ||
-		    !(rx->flags & IEEE80211_RX_RA_MATCH)) {
+		    !(status->rx_flags & IEEE80211_RX_RA_MATCH)) {
 			/* Drop IBSS frames and frames for other hosts
 			 * silently. */
 			return RX_DROP_MONITOR;
@@ -823,8 +870,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 	int keyidx;
 	int hdrlen;
 	ieee80211_rx_result result = RX_DROP_UNUSABLE;
-	struct ieee80211_key *stakey = NULL;
+	struct ieee80211_key *sta_ptk = NULL;
 	int mmie_keyidx = -1;
+	__le16 fc;
 
 	/*
 	 * Key selection 101
@@ -857,22 +905,27 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 	 * No point in finding a key and decrypting if the frame is neither
 	 * addressed to us nor a multicast frame.
 	 */
-	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
 		return RX_CONTINUE;
 
 	/* start without a key */
 	rx->key = NULL;
 
 	if (rx->sta)
-		stakey = rcu_dereference(rx->sta->key);
+		sta_ptk = rcu_dereference(rx->sta->ptk);
 
-	if (!ieee80211_has_protected(hdr->frame_control))
+	fc = hdr->frame_control;
+
+	if (!ieee80211_has_protected(fc))
 		mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
 
-	if (!is_multicast_ether_addr(hdr->addr1) && stakey) {
-		rx->key = stakey;
+	if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
+		rx->key = sta_ptk;
+		if ((status->flag & RX_FLAG_DECRYPTED) &&
+		    (status->flag & RX_FLAG_IV_STRIPPED))
+			return RX_CONTINUE;
 		/* Skip decryption if the frame is not protected. */
-		if (!ieee80211_has_protected(hdr->frame_control))
+		if (!ieee80211_has_protected(fc))
 			return RX_CONTINUE;
 	} else if (mmie_keyidx >= 0) {
 		/* Broadcast/multicast robust management frame / BIP */
@@ -883,8 +936,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		if (mmie_keyidx < NUM_DEFAULT_KEYS ||
 		    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
 			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
-		rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
-	} else if (!ieee80211_has_protected(hdr->frame_control)) {
+		if (rx->sta)
+			rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
+		if (!rx->key)
+			rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
+	} else if (!ieee80211_has_protected(fc)) {
 		/*
 		 * The frame was not protected, so skip decryption. However, we
 		 * need to set rx->key if there is a key that could have been
@@ -892,7 +948,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		 * have been expected.
 		 */
 		struct ieee80211_key *key = NULL;
-		if (ieee80211_is_mgmt(hdr->frame_control) &&
+		if (ieee80211_is_mgmt(fc) &&
 		    is_multicast_ether_addr(hdr->addr1) &&
 		    (key = rcu_dereference(rx->sdata->default_mgmt_key)))
 			rx->key = key;
@@ -914,7 +970,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		    (status->flag & RX_FLAG_IV_STRIPPED))
 			return RX_CONTINUE;
 
-		hdrlen = ieee80211_hdrlen(hdr->frame_control);
+		hdrlen = ieee80211_hdrlen(fc);
 
 		if (rx->skb->len < 8 + hdrlen)
 			return RX_DROP_UNUSABLE; /* TODO: count this? */
@@ -926,16 +982,25 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 		skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
 		keyidx = keyid >> 6;
 
-		rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+		/* check per-station GTK first, if multicast packet */
+		if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
+			rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
 
-		/*
-		 * RSNA-protected unicast frames should always be sent with
-		 * pairwise or station-to-station keys, but for WEP we allow
-		 * using a key index as well.
-		 */
-		if (rx->key && rx->key->conf.alg != ALG_WEP &&
-		    !is_multicast_ether_addr(hdr->addr1))
-			rx->key = NULL;
+		/* if not found, try default key */
+		if (!rx->key) {
+			rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
+
+			/*
+			 * RSNA-protected unicast frames should always be
+			 * sent with pairwise or station-to-station keys,
+			 * but for WEP we allow using a key index as well.
+			 */
+			if (rx->key &&
+			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
+			    rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
+			    !is_multicast_ether_addr(hdr->addr1))
+				rx->key = NULL;
+		}
 	}
 
 	if (rx->key) {
@@ -947,30 +1012,35 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 
 	if (skb_linearize(rx->skb))
 		return RX_DROP_UNUSABLE;
+	/* the hdr variable is invalid now! */
+
+	switch (rx->key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
+		/* Check for weak IVs if possible */
+		if (rx->sta && ieee80211_is_data(fc) &&
+		    (!(status->flag & RX_FLAG_IV_STRIPPED) ||
+		     !(status->flag & RX_FLAG_DECRYPTED)) &&
+		    ieee80211_wep_is_weak_iv(rx->skb, rx->key))
+			rx->sta->wep_weak_iv_count++;
 
-	hdr = (struct ieee80211_hdr *)rx->skb->data;
-
-	/* Check for weak IVs if possible */
-	if (rx->sta && rx->key->conf.alg == ALG_WEP &&
-	    ieee80211_is_data(hdr->frame_control) &&
-	    (!(status->flag & RX_FLAG_IV_STRIPPED) ||
-	     !(status->flag & RX_FLAG_DECRYPTED)) &&
-	    ieee80211_wep_is_weak_iv(rx->skb, rx->key))
-		rx->sta->wep_weak_iv_count++;
-
-	switch (rx->key->conf.alg) {
-	case ALG_WEP:
 		result = ieee80211_crypto_wep_decrypt(rx);
 		break;
-	case ALG_TKIP:
+	case WLAN_CIPHER_SUITE_TKIP:
 		result = ieee80211_crypto_tkip_decrypt(rx);
 		break;
-	case ALG_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP:
 		result = ieee80211_crypto_ccmp_decrypt(rx);
 		break;
-	case ALG_AES_CMAC:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		result = ieee80211_crypto_aes_cmac_decrypt(rx);
 		break;
+	default:
+		/*
+		 * We can reach here only with HW-only algorithms
+		 * but why didn't it decrypt the frame?!
+		 */
+		return RX_DROP_UNUSABLE;
 	}
 
 	/* either the frame has been decrypted or will be dropped */
@@ -1079,7 +1149,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 		sta->last_rx = jiffies;
 	}
 
-	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
 		return RX_CONTINUE;
 
 	if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
@@ -1236,6 +1306,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	unsigned int frag, seq;
 	struct ieee80211_fragment_entry *entry;
 	struct sk_buff *skb;
+	struct ieee80211_rx_status *status;
 
 	hdr = (struct ieee80211_hdr *)rx->skb->data;
 	fc = hdr->frame_control;
@@ -1265,13 +1336,15 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 		/* This is the first fragment of a new frame. */
 		entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
 						 rx->queue, &(rx->skb));
-		if (rx->key && rx->key->conf.alg == ALG_CCMP &&
+		if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP &&
 		    ieee80211_has_protected(fc)) {
+			int queue = ieee80211_is_mgmt(fc) ?
+				NUM_RX_DATA_QUEUES : rx->queue;
 			/* Store CCMP PN so that we can verify that the next
 			 * fragment has a sequential PN value. */
 			entry->ccmp = 1;
 			memcpy(entry->last_pn,
-			       rx->key->u.ccmp.rx_pn[rx->queue],
+			       rx->key->u.ccmp.rx_pn[queue],
 			       CCMP_PN_LEN);
 		}
 		return RX_QUEUED;
@@ -1291,7 +1364,8 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	if (entry->ccmp) {
 		int i;
 		u8 pn[CCMP_PN_LEN], *rpn;
-		if (!rx->key || rx->key->conf.alg != ALG_CCMP)
+		int queue;
+		if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP)
 			return RX_DROP_UNUSABLE;
 		memcpy(pn, entry->last_pn, CCMP_PN_LEN);
 		for (i = CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -1299,7 +1373,9 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 			if (pn[i])
 				break;
 		}
-		rpn = rx->key->u.ccmp.rx_pn[rx->queue];
+		queue = ieee80211_is_mgmt(fc) ?
+			NUM_RX_DATA_QUEUES : rx->queue;
+		rpn = rx->key->u.ccmp.rx_pn[queue];
 		if (memcmp(pn, rpn, CCMP_PN_LEN))
 			return RX_DROP_UNUSABLE;
 		memcpy(entry->last_pn, pn, CCMP_PN_LEN);
@@ -1330,7 +1406,8 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	}
 
 	/* Complete frame has been reassembled - process it now */
-	rx->flags |= IEEE80211_RX_FRAGMENTED;
+	status = IEEE80211_SKB_RXCB(rx->skb);
+	status->rx_flags |= IEEE80211_RX_FRAGMENTED;
 
  out:
 	if (rx->sta)
@@ -1347,9 +1424,10 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_sub_if_data *sdata = rx->sdata;
 	__le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 
 	if (likely(!rx->sta || !ieee80211_is_pspoll(fc) ||
-		   !(rx->flags & IEEE80211_RX_RA_MATCH)))
+		   !(status->rx_flags & IEEE80211_RX_RA_MATCH)))
 		return RX_CONTINUE;
 
 	if ((sdata->vif.type != NL80211_IFTYPE_AP) &&
@@ -1487,7 +1565,7 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
 	 * Allow EAPOL frames to us/the PAE group address regardless
 	 * of whether the frame was encrypted or not.
 	 */
-	if (ehdr->h_proto == htons(ETH_P_PAE) &&
+	if (ehdr->h_proto == rx->sdata->control_port_protocol &&
 	    (compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 ||
 	     compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0))
 		return true;
@@ -1510,6 +1588,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 	struct sk_buff *skb, *xmit_skb;
 	struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
 	struct sta_info *dsta;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 
 	skb = rx->skb;
 	xmit_skb = NULL;
@@ -1517,7 +1596,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 	if ((sdata->vif.type == NL80211_IFTYPE_AP ||
 	     sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
 	    !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
-	    (rx->flags & IEEE80211_RX_RA_MATCH) &&
+	    (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
 	    (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
 		if (is_multicast_ether_addr(ehdr->h_dest)) {
 			/*
@@ -1573,7 +1652,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 			/* deliver to local stack */
 			skb->protocol = eth_type_trans(skb, dev);
 			memset(skb->cb, 0, sizeof(skb->cb));
-			netif_rx(skb);
+			netif_receive_skb(skb);
 		}
 	}
 
@@ -1594,6 +1673,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	__le16 fc = hdr->frame_control;
 	struct sk_buff_head frame_list;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 
 	if (unlikely(!ieee80211_is_data(fc)))
 		return RX_CONTINUE;
@@ -1601,7 +1681,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 	if (unlikely(!ieee80211_is_data_present(fc)))
 		return RX_DROP_MONITOR;
 
-	if (!(rx->flags & IEEE80211_RX_AMSDU))
+	if (!(status->rx_flags & IEEE80211_RX_AMSDU))
 		return RX_CONTINUE;
 
 	if (ieee80211_has_a4(hdr->frame_control) &&
@@ -1652,6 +1732,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 	struct sk_buff *skb = rx->skb, *fwd_skb;
 	struct ieee80211_local *local = rx->local;
 	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 
 	hdr = (struct ieee80211_hdr *) skb->data;
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -1697,7 +1778,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 
 	mesh_hdr->ttl--;
 
-	if (rx->flags & IEEE80211_RX_RA_MATCH) {
+	if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
 		if (!mesh_hdr->ttl)
 			IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh,
 						     dropped_frames_ttl);
@@ -1829,13 +1910,11 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
 				  &bar_data, sizeof(bar_data)))
 			return RX_DROP_MONITOR;
 
-		spin_lock(&rx->sta->lock);
 		tid = le16_to_cpu(bar_data.control) >> 12;
-		if (!rx->sta->ampdu_mlme.tid_active_rx[tid]) {
-			spin_unlock(&rx->sta->lock);
+
+		tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
+		if (!tid_agg_rx)
 			return RX_DROP_MONITOR;
-		}
-		tid_agg_rx = rx->sta->ampdu_mlme.tid_rx[tid];
 
 		start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
 
@@ -1848,11 +1927,15 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
 		ieee80211_release_reorder_frames(hw, tid_agg_rx, start_seq_num,
 						 frames);
 		kfree_skb(skb);
-		spin_unlock(&rx->sta->lock);
 		return RX_QUEUED;
 	}
 
-	return RX_CONTINUE;
+	/*
+	 * After this point, we only want management frames,
+	 * so we can drop all remaining control frames to
+	 * cooked monitor interfaces.
+	 */
+	return RX_DROP_MONITOR;
 }
 
 static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
@@ -1902,13 +1985,38 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
 }
 
 static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	/*
+	 * From here on, look only at management frames.
+	 * Data and control frames are already handled,
+	 * and unknown (reserved) frames are useless.
+	 */
+	if (rx->skb->len < 24)
+		return RX_DROP_MONITOR;
+
+	if (!ieee80211_is_mgmt(mgmt->frame_control))
+		return RX_DROP_MONITOR;
+
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
+		return RX_DROP_MONITOR;
+
+	if (ieee80211_drop_unencrypted_mgmt(rx))
+		return RX_DROP_UNUSABLE;
+
+	return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
 ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_local *local = rx->local;
 	struct ieee80211_sub_if_data *sdata = rx->sdata;
 	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
-	struct sk_buff *nskb;
-	struct ieee80211_rx_status *status;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 	int len = rx->skb->len;
 
 	if (!ieee80211_is_action(mgmt->frame_control))
@@ -1921,10 +2029,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 	if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
 		return RX_DROP_UNUSABLE;
 
-	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
-		return RX_DROP_UNUSABLE;
-
-	if (ieee80211_drop_unencrypted_mgmt(rx))
+	if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
 		return RX_DROP_UNUSABLE;
 
 	switch (mgmt->u.action.category) {
@@ -1944,30 +2049,27 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 		if (len < IEEE80211_MIN_ACTION_SIZE + 1)
 			break;
 
-		if (sdata->vif.type == NL80211_IFTYPE_STATION)
-			return ieee80211_sta_rx_mgmt(sdata, rx->skb);
-
 		switch (mgmt->u.action.u.addba_req.action_code) {
 		case WLAN_ACTION_ADDBA_REQ:
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.addba_req)))
-				return RX_DROP_MONITOR;
-			ieee80211_process_addba_request(local, rx->sta, mgmt, len);
-			goto handled;
+				goto invalid;
+			break;
 		case WLAN_ACTION_ADDBA_RESP:
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.addba_resp)))
-				break;
-			ieee80211_process_addba_resp(local, rx->sta, mgmt, len);
-			goto handled;
+				goto invalid;
+			break;
 		case WLAN_ACTION_DELBA:
 			if (len < (IEEE80211_MIN_ACTION_SIZE +
 				   sizeof(mgmt->u.action.u.delba)))
-				break;
-			ieee80211_process_delba(sdata, rx->sta, mgmt, len);
-			goto handled;
+				goto invalid;
+			break;
+		default:
+			goto invalid;
 		}
-		break;
+
+		goto queue;
 	case WLAN_CATEGORY_SPECTRUM_MGMT:
 		if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
 			break;
@@ -1997,7 +2099,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN))
 				break;
 
-			return ieee80211_sta_rx_mgmt(sdata, rx->skb);
+			goto queue;
 		}
 		break;
 	case WLAN_CATEGORY_SA_QUERY:
@@ -2015,21 +2117,41 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 		break;
 	case WLAN_CATEGORY_MESH_PLINK:
 	case WLAN_CATEGORY_MESH_PATH_SEL:
-		if (ieee80211_vif_is_mesh(&sdata->vif))
-			return ieee80211_mesh_rx_mgmt(sdata, rx->skb);
-		break;
+		if (!ieee80211_vif_is_mesh(&sdata->vif))
+			break;
+		goto queue;
 	}
 
-	/*
-	 * For AP mode, hostapd is responsible for handling any action
-	 * frames that we didn't handle, including returning unknown
-	 * ones. For all other modes we will return them to the sender,
-	 * setting the 0x80 bit in the action category, as required by
-	 * 802.11-2007 7.3.1.11.
-	 */
-	if (sdata->vif.type == NL80211_IFTYPE_AP ||
-	    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-		return RX_DROP_MONITOR;
+	return RX_CONTINUE;
+
+ invalid:
+	status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
+	/* will return in the next handlers */
+	return RX_CONTINUE;
+
+ handled:
+	if (rx->sta)
+		rx->sta->rx_packets++;
+	dev_kfree_skb(rx->skb);
+	return RX_QUEUED;
+
+ queue:
+	rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+	skb_queue_tail(&sdata->skb_queue, rx->skb);
+	ieee80211_queue_work(&local->hw, &sdata->work);
+	if (rx->sta)
+		rx->sta->rx_packets++;
+	return RX_QUEUED;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	/* skip known-bad action frames and return them in the next handler */
+	if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
+		return RX_CONTINUE;
 
 	/*
 	 * Getting here means the kernel doesn't know how to handle
@@ -2037,13 +2159,46 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 	 * so userspace can register for those to know whether ones
 	 * it transmitted were processed or returned.
 	 */
-	status = IEEE80211_SKB_RXCB(rx->skb);
 
-	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
-	    cfg80211_rx_action(rx->sdata->dev, status->freq,
-			       rx->skb->data, rx->skb->len,
-			       GFP_ATOMIC))
-		goto handled;
+	if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq,
+			     rx->skb->data, rx->skb->len,
+			     GFP_ATOMIC)) {
+		if (rx->sta)
+			rx->sta->rx_packets++;
+		dev_kfree_skb(rx->skb);
+		return RX_QUEUED;
+	}
+
+
+	return RX_CONTINUE;
+}
+
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
+	struct sk_buff *nskb;
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+
+	if (!ieee80211_is_action(mgmt->frame_control))
+		return RX_CONTINUE;
+
+	/*
+	 * For AP mode, hostapd is responsible for handling any action
+	 * frames that we didn't handle, including returning unknown
+	 * ones. For all other modes we will return them to the sender,
+	 * setting the 0x80 bit in the action category, as required by
+	 * 802.11-2007 7.3.1.11.
+	 * Newer versions of hostapd shall also use the management frame
+	 * registration mechanisms, but older ones still use cooked
+	 * monitor interfaces so push all frames there.
+	 */
+	if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
+	    (sdata->vif.type == NL80211_IFTYPE_AP ||
+	     sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
+		return RX_DROP_MONITOR;
 
 	/* do not return rejected action frames */
 	if (mgmt->u.action.category & 0x80)
@@ -2052,20 +2207,16 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 	nskb = skb_copy_expand(rx->skb, local->hw.extra_tx_headroom, 0,
 			       GFP_ATOMIC);
 	if (nskb) {
-		struct ieee80211_mgmt *mgmt = (void *)nskb->data;
+		struct ieee80211_mgmt *nmgmt = (void *)nskb->data;
 
-		mgmt->u.action.category |= 0x80;
-		memcpy(mgmt->da, mgmt->sa, ETH_ALEN);
-		memcpy(mgmt->sa, rx->sdata->vif.addr, ETH_ALEN);
+		nmgmt->u.action.category |= 0x80;
+		memcpy(nmgmt->da, nmgmt->sa, ETH_ALEN);
+		memcpy(nmgmt->sa, rx->sdata->vif.addr, ETH_ALEN);
 
 		memset(nskb->cb, 0, sizeof(nskb->cb));
 
 		ieee80211_tx_skb(rx->sdata, nskb);
 	}
-
- handled:
-	if (rx->sta)
-		rx->sta->rx_packets++;
 	dev_kfree_skb(rx->skb);
 	return RX_QUEUED;
 }
@@ -2075,27 +2226,49 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 {
 	struct ieee80211_sub_if_data *sdata = rx->sdata;
 	ieee80211_rx_result rxs;
-
-	if (!(rx->flags & IEEE80211_RX_RA_MATCH))
-		return RX_DROP_MONITOR;
-
-	if (ieee80211_drop_unencrypted_mgmt(rx))
-		return RX_DROP_UNUSABLE;
+	struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
+	__le16 stype;
 
 	rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb);
 	if (rxs != RX_CONTINUE)
 		return rxs;
 
-	if (ieee80211_vif_is_mesh(&sdata->vif))
-		return ieee80211_mesh_rx_mgmt(sdata, rx->skb);
+	stype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE);
 
-	if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
-		return ieee80211_ibss_rx_mgmt(sdata, rx->skb);
+	if (!ieee80211_vif_is_mesh(&sdata->vif) &&
+	    sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+	    sdata->vif.type != NL80211_IFTYPE_STATION)
+		return RX_DROP_MONITOR;
 
-	if (sdata->vif.type == NL80211_IFTYPE_STATION)
-		return ieee80211_sta_rx_mgmt(sdata, rx->skb);
+	switch (stype) {
+	case cpu_to_le16(IEEE80211_STYPE_BEACON):
+	case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP):
+		/* process for all: mesh, mlme, ibss */
+		break;
+	case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
+	case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
+		/* process only for station */
+		if (sdata->vif.type != NL80211_IFTYPE_STATION)
+			return RX_DROP_MONITOR;
+		break;
+	case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ):
+	case cpu_to_le16(IEEE80211_STYPE_AUTH):
+		/* process only for ibss */
+		if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
+			return RX_DROP_MONITOR;
+		break;
+	default:
+		return RX_DROP_MONITOR;
+	}
 
-	return RX_DROP_MONITOR;
+	/* queue up frame and kick off work to process it */
+	rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
+	skb_queue_tail(&sdata->skb_queue, rx->skb);
+	ieee80211_queue_work(&rx->local->hw, &sdata->work);
+	if (rx->sta)
+		rx->sta->rx_packets++;
+
+	return RX_QUEUED;
 }
 
 static void ieee80211_rx_michael_mic_report(struct ieee80211_hdr *hdr,
@@ -2151,13 +2324,18 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 		u8 rate_or_pad;
 		__le16 chan_freq;
 		__le16 chan_flags;
-	} __attribute__ ((packed)) *rthdr;
+	} __packed *rthdr;
 	struct sk_buff *skb = rx->skb, *skb2;
 	struct net_device *prev_dev = NULL;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 
-	if (status->flag & RX_FLAG_INTERNAL_CMTR)
+	/*
+	 * If cooked monitor has been processed already, then
+	 * don't do it again. If not, set the flag.
+	 */
+	if (rx->flags & IEEE80211_RX_CMNTR)
 		goto out_free_skb;
+	rx->flags |= IEEE80211_RX_CMNTR;
 
 	if (skb_headroom(skb) < sizeof(*rthdr) &&
 	    pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC))
@@ -2201,7 +2379,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 			skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (skb2) {
 				skb2->dev = prev_dev;
-				netif_rx(skb2);
+				netif_receive_skb(skb2);
 			}
 		}
 
@@ -2212,31 +2390,54 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 
 	if (prev_dev) {
 		skb->dev = prev_dev;
-		netif_rx(skb);
-		skb = NULL;
-	} else
-		goto out_free_skb;
-
-	status->flag |= RX_FLAG_INTERNAL_CMTR;
-	return;
+		netif_receive_skb(skb);
+		return;
+	}
 
  out_free_skb:
 	dev_kfree_skb(skb);
 }
 
+static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
+					 ieee80211_rx_result res)
+{
+	switch (res) {
+	case RX_DROP_MONITOR:
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+		if (rx->sta)
+			rx->sta->rx_dropped++;
+		/* fall through */
+	case RX_CONTINUE: {
+		struct ieee80211_rate *rate = NULL;
+		struct ieee80211_supported_band *sband;
+		struct ieee80211_rx_status *status;
+
+		status = IEEE80211_SKB_RXCB((rx->skb));
+
+		sband = rx->local->hw.wiphy->bands[status->band];
+		if (!(status->flag & RX_FLAG_HT))
+			rate = &sband->bitrates[status->rate_idx];
+
+		ieee80211_rx_cooked_monitor(rx, rate);
+		break;
+		}
+	case RX_DROP_UNUSABLE:
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
+		if (rx->sta)
+			rx->sta->rx_dropped++;
+		dev_kfree_skb(rx->skb);
+		break;
+	case RX_QUEUED:
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+		break;
+	}
+}
 
-static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
-					 struct ieee80211_rx_data *rx,
-					 struct sk_buff *skb,
-					 struct ieee80211_rate *rate)
+static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
+				  struct sk_buff_head *frames)
 {
-	struct sk_buff_head reorder_release;
 	ieee80211_rx_result res = RX_DROP_MONITOR;
-
-	__skb_queue_head_init(&reorder_release);
-
-	rx->skb = skb;
-	rx->sdata = sdata;
+	struct sk_buff *skb;
 
 #define CALL_RXH(rxh)			\
 	do {				\
@@ -2245,23 +2446,14 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
 			goto rxh_next;  \
 	} while (0);
 
-	/*
-	 * NB: the rxh_next label works even if we jump
-	 *     to it from here because then the list will
-	 *     be empty, which is a trivial check
-	 */
-	CALL_RXH(ieee80211_rx_h_passive_scan)
-	CALL_RXH(ieee80211_rx_h_check)
-
-	ieee80211_rx_reorder_ampdu(rx, &reorder_release);
-
-	while ((skb = __skb_dequeue(&reorder_release))) {
+	while ((skb = __skb_dequeue(frames))) {
 		/*
 		 * all the other fields are valid across frames
 		 * that belong to an aMPDU since they are on the
 		 * same TID from the same station
 		 */
 		rx->skb = skb;
+		rx->flags = 0;
 
 		CALL_RXH(ieee80211_rx_h_decrypt)
 		CALL_RXH(ieee80211_rx_h_check_more_data)
@@ -2273,50 +2465,92 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata,
 		CALL_RXH(ieee80211_rx_h_remove_qos_control)
 		CALL_RXH(ieee80211_rx_h_amsdu)
 #ifdef CONFIG_MAC80211_MESH
-		if (ieee80211_vif_is_mesh(&sdata->vif))
+		if (ieee80211_vif_is_mesh(&rx->sdata->vif))
 			CALL_RXH(ieee80211_rx_h_mesh_fwding);
 #endif
 		CALL_RXH(ieee80211_rx_h_data)
 
 		/* special treatment -- needs the queue */
-		res = ieee80211_rx_h_ctrl(rx, &reorder_release);
+		res = ieee80211_rx_h_ctrl(rx, frames);
 		if (res != RX_CONTINUE)
 			goto rxh_next;
 
+		CALL_RXH(ieee80211_rx_h_mgmt_check)
 		CALL_RXH(ieee80211_rx_h_action)
+		CALL_RXH(ieee80211_rx_h_userspace_mgmt)
+		CALL_RXH(ieee80211_rx_h_action_return)
 		CALL_RXH(ieee80211_rx_h_mgmt)
 
+ rxh_next:
+		ieee80211_rx_handlers_result(rx, res);
+
 #undef CALL_RXH
+	}
+}
+
+static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff_head reorder_release;
+	ieee80211_rx_result res = RX_DROP_MONITOR;
+
+	__skb_queue_head_init(&reorder_release);
+
+#define CALL_RXH(rxh)			\
+	do {				\
+		res = rxh(rx);		\
+		if (res != RX_CONTINUE)	\
+			goto rxh_next;  \
+	} while (0);
+
+	CALL_RXH(ieee80211_rx_h_passive_scan)
+	CALL_RXH(ieee80211_rx_h_check)
+
+	ieee80211_rx_reorder_ampdu(rx, &reorder_release);
+
+	ieee80211_rx_handlers(rx, &reorder_release);
+	return;
 
  rxh_next:
-		switch (res) {
-		case RX_DROP_MONITOR:
-			I802_DEBUG_INC(sdata->local->rx_handlers_drop);
-			if (rx->sta)
-				rx->sta->rx_dropped++;
-			/* fall through */
-		case RX_CONTINUE:
-			ieee80211_rx_cooked_monitor(rx, rate);
-			break;
-		case RX_DROP_UNUSABLE:
-			I802_DEBUG_INC(sdata->local->rx_handlers_drop);
-			if (rx->sta)
-				rx->sta->rx_dropped++;
-			dev_kfree_skb(rx->skb);
-			break;
-		case RX_QUEUED:
-			I802_DEBUG_INC(sdata->local->rx_handlers_queued);
-			break;
-		}
-	}
+	ieee80211_rx_handlers_result(rx, res);
+
+#undef CALL_RXH
+}
+
+/*
+ * This function makes calls into the RX path. Therefore the
+ * caller must hold the sta_info->lock and everything has to
+ * be under rcu_read_lock protection as well.
+ */
+void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
+{
+	struct sk_buff_head frames;
+	struct ieee80211_rx_data rx = {
+		.sta = sta,
+		.sdata = sta->sdata,
+		.local = sta->local,
+		.queue = tid,
+	};
+	struct tid_ampdu_rx *tid_agg_rx;
+
+	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+	if (!tid_agg_rx)
+		return;
+
+	__skb_queue_head_init(&frames);
+
+	spin_lock(&tid_agg_rx->reorder_lock);
+	ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames);
+	spin_unlock(&tid_agg_rx->reorder_lock);
+
+	ieee80211_rx_handlers(&rx, &frames);
 }
 
 /* main receive path */
 
-static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
-				struct ieee80211_rx_data *rx,
+static int prepare_for_handlers(struct ieee80211_rx_data *rx,
 				struct ieee80211_hdr *hdr)
 {
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
 	struct sk_buff *skb = rx->skb;
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
@@ -2330,7 +2564,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 		    compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) {
 			if (!(sdata->dev->flags & IFF_PROMISC))
 				return 0;
-			rx->flags &= ~IEEE80211_RX_RA_MATCH;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
 		}
 		break;
 	case NL80211_IFTYPE_ADHOC:
@@ -2340,15 +2574,15 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 			return 1;
 		}
 		else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
-			if (!(rx->flags & IEEE80211_RX_IN_SCAN))
+			if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
 				return 0;
-			rx->flags &= ~IEEE80211_RX_RA_MATCH;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
 		} else if (!multicast &&
 			   compare_ether_addr(sdata->vif.addr,
 					      hdr->addr1) != 0) {
 			if (!(sdata->dev->flags & IFF_PROMISC))
 				return 0;
-			rx->flags &= ~IEEE80211_RX_RA_MATCH;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
 		} else if (!rx->sta) {
 			int rate_idx;
 			if (status->flag & RX_FLAG_HT)
@@ -2366,7 +2600,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 			if (!(sdata->dev->flags & IFF_PROMISC))
 				return 0;
 
-			rx->flags &= ~IEEE80211_RX_RA_MATCH;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
 		}
 		break;
 	case NL80211_IFTYPE_AP_VLAN:
@@ -2377,9 +2611,9 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 				return 0;
 		} else if (!ieee80211_bssid_match(bssid,
 					sdata->vif.addr)) {
-			if (!(rx->flags & IEEE80211_RX_IN_SCAN))
+			if (!(status->rx_flags & IEEE80211_RX_IN_SCAN))
 				return 0;
-			rx->flags &= ~IEEE80211_RX_RA_MATCH;
+			status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
 		}
 		break;
 	case NL80211_IFTYPE_WDS:
@@ -2388,9 +2622,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 		if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2))
 			return 0;
 		break;
-	case NL80211_IFTYPE_MONITOR:
-	case NL80211_IFTYPE_UNSPECIFIED:
-	case __NL80211_IFTYPE_AFTER_LAST:
+	default:
 		/* should never get here */
 		WARN_ON(1);
 		break;
@@ -2400,12 +2632,56 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
 }
 
 /*
+ * This function returns whether or not the SKB
+ * was destined for RX processing or not, which,
+ * if consume is true, is equivalent to whether
+ * or not the skb was consumed.
+ */
+static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
+					    struct sk_buff *skb, bool consume)
+{
+	struct ieee80211_local *local = rx->local;
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (void *)skb->data;
+	int prepares;
+
+	rx->skb = skb;
+	status->rx_flags |= IEEE80211_RX_RA_MATCH;
+	prepares = prepare_for_handlers(rx, hdr);
+
+	if (!prepares)
+		return false;
+
+	if (status->flag & RX_FLAG_MMIC_ERROR) {
+		if (status->rx_flags & IEEE80211_RX_RA_MATCH)
+			ieee80211_rx_michael_mic_report(hdr, rx);
+		return false;
+	}
+
+	if (!consume) {
+		skb = skb_copy(skb, GFP_ATOMIC);
+		if (!skb) {
+			if (net_ratelimit())
+				wiphy_debug(local->hw.wiphy,
+					"failed to copy multicast frame for %s\n",
+					sdata->name);
+			return true;
+		}
+
+		rx->skb = skb;
+	}
+
+	ieee80211_invoke_rx_handlers(rx);
+	return true;
+}
+
+/*
  * This is the actual Rx frames handler. as it blongs to Rx path it must
  * be called with rcu_read_lock protection.
  */
 static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
-					 struct sk_buff *skb,
-					 struct ieee80211_rate *rate)
+					 struct sk_buff *skb)
 {
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	struct ieee80211_local *local = hw_to_local(hw);
@@ -2413,11 +2689,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 	struct ieee80211_hdr *hdr;
 	__le16 fc;
 	struct ieee80211_rx_data rx;
-	int prepares;
-	struct ieee80211_sub_if_data *prev = NULL;
-	struct sk_buff *skb_new;
-	struct sta_info *sta, *tmp;
-	bool found_sta = false;
+	struct ieee80211_sub_if_data *prev;
+	struct sta_info *sta, *tmp, *prev_sta;
 	int err = 0;
 
 	fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
@@ -2430,7 +2703,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 
 	if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) ||
 		     test_bit(SCAN_OFF_CHANNEL, &local->scanning)))
-		rx.flags |= IEEE80211_RX_IN_SCAN;
+		status->rx_flags |= IEEE80211_RX_IN_SCAN;
 
 	if (ieee80211_is_mgmt(fc))
 		err = skb_linearize(skb);
@@ -2447,91 +2720,67 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
 	ieee80211_verify_alignment(&rx);
 
 	if (ieee80211_is_data(fc)) {
+		prev_sta = NULL;
+
 		for_each_sta_info(local, hdr->addr2, sta, tmp) {
-			rx.sta = sta;
-			found_sta = true;
-			rx.sdata = sta->sdata;
-
-			rx.flags |= IEEE80211_RX_RA_MATCH;
-			prepares = prepare_for_handlers(rx.sdata, &rx, hdr);
-			if (prepares) {
-				if (status->flag & RX_FLAG_MMIC_ERROR) {
-					if (rx.flags & IEEE80211_RX_RA_MATCH)
-						ieee80211_rx_michael_mic_report(hdr, &rx);
-				} else
-					prev = rx.sdata;
-			}
-		}
-	}
-	if (!found_sta) {
-		list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-			if (!ieee80211_sdata_running(sdata))
+			if (!prev_sta) {
+				prev_sta = sta;
 				continue;
+			}
 
-			if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
-			    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-				continue;
+			rx.sta = prev_sta;
+			rx.sdata = prev_sta->sdata;
+			ieee80211_prepare_and_rx_handle(&rx, skb, false);
 
-			/*
-			 * frame is destined for this interface, but if it's
-			 * not also for the previous one we handle that after
-			 * the loop to avoid copying the SKB once too much
-			 */
+			prev_sta = sta;
+		}
 
-			if (!prev) {
-				prev = sdata;
-				continue;
-			}
+		if (prev_sta) {
+			rx.sta = prev_sta;
+			rx.sdata = prev_sta->sdata;
 
-			rx.sta = sta_info_get_bss(prev, hdr->addr2);
+			if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+				return;
+		}
+	}
 
-			rx.flags |= IEEE80211_RX_RA_MATCH;
-			prepares = prepare_for_handlers(prev, &rx, hdr);
+	prev = NULL;
 
-			if (!prepares)
-				goto next;
+	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+		if (!ieee80211_sdata_running(sdata))
+			continue;
 
-			if (status->flag & RX_FLAG_MMIC_ERROR) {
-				rx.sdata = prev;
-				if (rx.flags & IEEE80211_RX_RA_MATCH)
-					ieee80211_rx_michael_mic_report(hdr,
-									&rx);
-				goto next;
-			}
+		if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+		    sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+			continue;
 
-			/*
-			 * frame was destined for the previous interface
-			 * so invoke RX handlers for it
-			 */
+		/*
+		 * frame is destined for this interface, but if it's
+		 * not also for the previous one we handle that after
+		 * the loop to avoid copying the SKB once too much
+		 */
 
-			skb_new = skb_copy(skb, GFP_ATOMIC);
-			if (!skb_new) {
-				if (net_ratelimit())
-					printk(KERN_DEBUG "%s: failed to copy "
-					       "multicast frame for %s\n",
-					       wiphy_name(local->hw.wiphy),
-					       prev->name);
-				goto next;
-			}
-			ieee80211_invoke_rx_handlers(prev, &rx, skb_new, rate);
-next:
+		if (!prev) {
 			prev = sdata;
+			continue;
 		}
 
-		if (prev) {
-			rx.sta = sta_info_get_bss(prev, hdr->addr2);
+		rx.sta = sta_info_get_bss(prev, hdr->addr2);
+		rx.sdata = prev;
+		ieee80211_prepare_and_rx_handle(&rx, skb, false);
 
-			rx.flags |= IEEE80211_RX_RA_MATCH;
-			prepares = prepare_for_handlers(prev, &rx, hdr);
+		prev = sdata;
+	}
 
-			if (!prepares)
-				prev = NULL;
-		}
+	if (prev) {
+		rx.sta = sta_info_get_bss(prev, hdr->addr2);
+		rx.sdata = prev;
+
+		if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
+			return;
 	}
-	if (prev)
-		ieee80211_invoke_rx_handlers(prev, &rx, skb, rate);
-	else
-		dev_kfree_skb(skb);
+
+	dev_kfree_skb(skb);
 }
 
 /*
@@ -2572,30 +2821,41 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
 	if (WARN_ON(!local->started))
 		goto drop;
 
-	if (status->flag & RX_FLAG_HT) {
+	if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
 		/*
-		 * rate_idx is MCS index, which can be [0-76] as documented on:
-		 *
-		 * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
-		 *
-		 * Anything else would be some sort of driver or hardware error.
-		 * The driver should catch hardware errors.
+		 * Validate the rate, unless a PLCP error means that
+		 * we probably can't have a valid rate here anyway.
 		 */
-		if (WARN((status->rate_idx < 0 ||
-			 status->rate_idx > 76),
-			 "Rate marked as an HT rate but passed "
-			 "status->rate_idx is not "
-			 "an MCS index [0-76]: %d (0x%02x)\n",
-			 status->rate_idx,
-			 status->rate_idx))
-			goto drop;
-	} else {
-		if (WARN_ON(status->rate_idx < 0 ||
-			    status->rate_idx >= sband->n_bitrates))
-			goto drop;
-		rate = &sband->bitrates[status->rate_idx];
+
+		if (status->flag & RX_FLAG_HT) {
+			/*
+			 * rate_idx is MCS index, which can be [0-76]
+			 * as documented on:
+			 *
+			 * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
+			 *
+			 * Anything else would be some sort of driver or
+			 * hardware error. The driver should catch hardware
+			 * errors.
+			 */
+			if (WARN((status->rate_idx < 0 ||
+				 status->rate_idx > 76),
+				 "Rate marked as an HT rate but passed "
+				 "status->rate_idx is not "
+				 "an MCS index [0-76]: %d (0x%02x)\n",
+				 status->rate_idx,
+				 status->rate_idx))
+				goto drop;
+		} else {
+			if (WARN_ON(status->rate_idx < 0 ||
+				    status->rate_idx >= sband->n_bitrates))
+				goto drop;
+			rate = &sband->bitrates[status->rate_idx];
+		}
 	}
 
+	status->rx_flags = 0;
+
 	/*
 	 * key references and virtual interfaces are protected using RCU
 	 * and this requires that we are in a read-side RCU section during
@@ -2615,7 +2875,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
 		return;
 	}
 
-	__ieee80211_rx_handle_packet(hw, skb, rate);
+	__ieee80211_rx_handle_packet(hw, skb);
 
 	rcu_read_unlock();
 
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index e1b0be7a57b..fb274db77e3 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -114,6 +114,10 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 		bss->dtim_period = tim_ie->dtim_period;
 	}
 
+	/* If the beacon had no TIM IE, or it was invalid, use 1 */
+	if (beacon && !bss->dtim_period)
+		bss->dtim_period = 1;
+
 	/* replace old supported rates if we get new values */
 	srlen = 0;
 	if (elems->supp_rates) {
@@ -238,20 +242,19 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local)
 	local->hw_scan_req->n_channels = n_chans;
 
 	ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie,
-					 req->ie, req->ie_len, band);
+					 req->ie, req->ie_len, band, (u32) -1,
+					 0);
 	local->hw_scan_req->ie_len = ielen;
 
 	return true;
 }
 
-void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+static bool __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
+				       bool was_hw_scan)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
-	bool was_hw_scan;
-
-	trace_api_scan_completed(local, aborted);
 
-	mutex_lock(&local->scan_mtx);
+	lockdep_assert_held(&local->mtx);
 
 	/*
 	 * It's ok to abort a not-yet-running scan (that
@@ -262,17 +265,13 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 	if (WARN_ON(!local->scanning && !aborted))
 		aborted = true;
 
-	if (WARN_ON(!local->scan_req)) {
-		mutex_unlock(&local->scan_mtx);
-		return;
-	}
+	if (WARN_ON(!local->scan_req))
+		return false;
 
-	was_hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
 	if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) {
-		ieee80211_queue_delayed_work(&local->hw,
-					     &local->scan_work, 0);
-		mutex_unlock(&local->scan_mtx);
-		return;
+		int rc = drv_hw_scan(local, local->scan_sdata, local->hw_scan_req);
+		if (rc == 0)
+			return false;
 	}
 
 	kfree(local->hw_scan_req);
@@ -286,26 +285,42 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
 	local->scanning = 0;
 	local->scan_channel = NULL;
 
-	/* we only have to protect scan_req and hw/sw scan */
-	mutex_unlock(&local->scan_mtx);
-
-	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
-	if (was_hw_scan)
-		goto done;
-
-	ieee80211_configure_filter(local);
+	return true;
+}
 
-	drv_sw_scan_complete(local);
+static void __ieee80211_scan_completed_finish(struct ieee80211_hw *hw,
+					      bool was_hw_scan)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
 
-	ieee80211_offchannel_return(local, true);
+	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+	if (!was_hw_scan) {
+		ieee80211_configure_filter(local);
+		drv_sw_scan_complete(local);
+		ieee80211_offchannel_return(local, true);
+	}
 
- done:
+	mutex_lock(&local->mtx);
 	ieee80211_recalc_idle(local);
+	mutex_unlock(&local->mtx);
+
 	ieee80211_mlme_notify_scan_completed(local);
 	ieee80211_ibss_notify_scan_completed(local);
 	ieee80211_mesh_notify_scan_completed(local);
 	ieee80211_queue_work(&local->hw, &local->work_work);
 }
+
+void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
+{
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	trace_api_scan_completed(local, aborted);
+
+	set_bit(SCAN_COMPLETED, &local->scanning);
+	if (aborted)
+		set_bit(SCAN_ABORTED, &local->scanning);
+	ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+}
 EXPORT_SYMBOL(ieee80211_scan_completed);
 
 static int ieee80211_start_sw_scan(struct ieee80211_local *local)
@@ -349,6 +364,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	int rc;
 
+	lockdep_assert_held(&local->mtx);
+
 	if (local->scan_req)
 		return -EBUSY;
 
@@ -396,19 +413,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
 	else
 		__set_bit(SCAN_SW_SCANNING, &local->scanning);
 
-	/*
-	 * Kicking off the scan need not be protected,
-	 * only the scan variable stuff, since now
-	 * local->scan_req is assigned and other callers
-	 * will abort their scan attempts.
-	 *
-	 * This avoids too many locking dependencies
-	 * so that the scan completed calls have more
-	 * locking freedom.
-	 */
-
 	ieee80211_recalc_idle(local);
-	mutex_unlock(&local->scan_mtx);
 
 	if (local->ops->hw_scan) {
 		WARN_ON(!ieee80211_prep_hw_scan(local));
@@ -416,8 +421,6 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
 	} else
 		rc = ieee80211_start_sw_scan(local);
 
-	mutex_lock(&local->scan_mtx);
-
 	if (rc) {
 		kfree(local->hw_scan_req);
 		local->hw_scan_req = NULL;
@@ -444,8 +447,8 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
 	return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
 }
 
-static int ieee80211_scan_state_decision(struct ieee80211_local *local,
-					 unsigned long *next_delay)
+static void ieee80211_scan_state_decision(struct ieee80211_local *local,
+					  unsigned long *next_delay)
 {
 	bool associated = false;
 	bool tx_empty = true;
@@ -455,12 +458,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_channel *next_chan;
 
-	/* if no more bands/channels left, complete scan and advance to the idle state */
-	if (local->scan_channel_idx >= local->scan_req->n_channels) {
-		ieee80211_scan_completed(&local->hw, false);
-		return 1;
-	}
-
 	/*
 	 * check if at least one STA interface is associated,
 	 * check if at least one STA interface has pending tx frames
@@ -532,7 +529,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
 	}
 
 	*next_delay = 0;
-	return 0;
 }
 
 static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local,
@@ -648,21 +644,18 @@ void ieee80211_scan_work(struct work_struct *work)
 		container_of(work, struct ieee80211_local, scan_work.work);
 	struct ieee80211_sub_if_data *sdata = local->scan_sdata;
 	unsigned long next_delay = 0;
+	bool aborted, hw_scan, finish;
 
-	mutex_lock(&local->scan_mtx);
-	if (!sdata || !local->scan_req) {
-		mutex_unlock(&local->scan_mtx);
-		return;
-	}
+	mutex_lock(&local->mtx);
 
-	if (local->hw_scan_req) {
-		int rc = drv_hw_scan(local, sdata, local->hw_scan_req);
-		mutex_unlock(&local->scan_mtx);
-		if (rc)
-			ieee80211_scan_completed(&local->hw, true);
-		return;
+	if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) {
+		aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning);
+		goto out_complete;
 	}
 
+	if (!sdata || !local->scan_req)
+		goto out;
+
 	if (local->scan_req && !local->scanning) {
 		struct cfg80211_scan_request *req = local->scan_req;
 		int rc;
@@ -671,21 +664,21 @@ void ieee80211_scan_work(struct work_struct *work)
 		local->scan_sdata = NULL;
 
 		rc = __ieee80211_start_scan(sdata, req);
-		mutex_unlock(&local->scan_mtx);
-
-		if (rc)
-			ieee80211_scan_completed(&local->hw, true);
-		return;
+		if (rc) {
+			/* need to complete scan in cfg80211 */
+			local->scan_req = req;
+			aborted = true;
+			goto out_complete;
+		} else
+			goto out;
 	}
 
-	mutex_unlock(&local->scan_mtx);
-
 	/*
 	 * Avoid re-scheduling when the sdata is going away.
 	 */
 	if (!ieee80211_sdata_running(sdata)) {
-		ieee80211_scan_completed(&local->hw, true);
-		return;
+		aborted = true;
+		goto out_complete;
 	}
 
 	/*
@@ -695,8 +688,12 @@ void ieee80211_scan_work(struct work_struct *work)
 	do {
 		switch (local->next_scan_state) {
 		case SCAN_DECISION:
-			if (ieee80211_scan_state_decision(local, &next_delay))
-				return;
+			/* if no more bands/channels left, complete scan */
+			if (local->scan_channel_idx >= local->scan_req->n_channels) {
+				aborted = false;
+				goto out_complete;
+			}
+			ieee80211_scan_state_decision(local, &next_delay);
 			break;
 		case SCAN_SET_CHANNEL:
 			ieee80211_scan_state_set_channel(local, &next_delay);
@@ -714,6 +711,19 @@ void ieee80211_scan_work(struct work_struct *work)
 	} while (next_delay == 0);
 
 	ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay);
+	mutex_unlock(&local->mtx);
+	return;
+
+out_complete:
+	hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning);
+	finish = __ieee80211_scan_completed(&local->hw, aborted, hw_scan);
+	mutex_unlock(&local->mtx);
+	if (finish)
+		__ieee80211_scan_completed_finish(&local->hw, hw_scan);
+	return;
+
+out:
+	mutex_unlock(&local->mtx);
 }
 
 int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
@@ -721,9 +731,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
 {
 	int res;
 
-	mutex_lock(&sdata->local->scan_mtx);
+	mutex_lock(&sdata->local->mtx);
 	res = __ieee80211_start_scan(sdata, req);
-	mutex_unlock(&sdata->local->scan_mtx);
+	mutex_unlock(&sdata->local->mtx);
 
 	return res;
 }
@@ -734,9 +744,9 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 	int ret = -EBUSY;
-	enum nl80211_band band;
+	enum ieee80211_band band;
 
-	mutex_lock(&local->scan_mtx);
+	mutex_lock(&local->mtx);
 
 	/* busy scanning */
 	if (local->scan_req)
@@ -771,25 +781,44 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata,
 
 	ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req);
  unlock:
-	mutex_unlock(&local->scan_mtx);
+	mutex_unlock(&local->mtx);
 	return ret;
 }
 
+/*
+ * Only call this function when a scan can't be queued -- under RTNL.
+ */
 void ieee80211_scan_cancel(struct ieee80211_local *local)
 {
 	bool abortscan;
-
-	cancel_delayed_work_sync(&local->scan_work);
+	bool finish = false;
 
 	/*
-	 * Only call this function when a scan can't be
-	 * queued -- mostly at suspend under RTNL.
+	 * We are only canceling software scan, or deferred scan that was not
+	 * yet really started (see __ieee80211_start_scan ).
+	 *
+	 * Regarding hardware scan:
+	 * - we can not call  __ieee80211_scan_completed() as when
+	 *   SCAN_HW_SCANNING bit is set this function change
+	 *   local->hw_scan_req to operate on 5G band, what race with
+	 *   driver which can use local->hw_scan_req
+	 *
+	 * - we can not cancel scan_work since driver can schedule it
+	 *   by ieee80211_scan_completed(..., true) to finish scan
+	 *
+	 * Hence low lever driver is responsible for canceling HW scan.
 	 */
-	mutex_lock(&local->scan_mtx);
-	abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) ||
-		    (!local->scanning && local->scan_req);
-	mutex_unlock(&local->scan_mtx);
 
+	mutex_lock(&local->mtx);
+	abortscan = local->scan_req && !test_bit(SCAN_HW_SCANNING, &local->scanning);
 	if (abortscan)
-		ieee80211_scan_completed(&local->hw, true);
+		finish = __ieee80211_scan_completed(&local->hw, true, false);
+	mutex_unlock(&local->mtx);
+
+	if (abortscan) {
+		/* The scan is canceled, but stop work from being pending */
+		cancel_delayed_work_sync(&local->scan_work);
+	}
+	if (finish)
+		__ieee80211_scan_completed_finish(&local->hw, false);
 }
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index ba9360a475b..6d8f897d876 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -125,7 +125,7 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
 				    lockdep_is_held(&local->sta_mtx));
 	while (sta) {
 		if ((sta->sdata == sdata ||
-		     sta->sdata->bss == sdata->bss) &&
+		     (sta->sdata->bss && sta->sdata->bss == sdata->bss)) &&
 		    memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
 			break;
 		sta = rcu_dereference_check(sta->hnext,
@@ -174,8 +174,7 @@ static void __sta_info_free(struct ieee80211_local *local,
 	}
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: Destroyed STA %pM\n",
-	       wiphy_name(local->hw.wiphy), sta->sta.addr);
+	wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr);
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 	kfree(sta);
@@ -235,6 +234,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	spin_lock_init(&sta->lock);
 	spin_lock_init(&sta->flaglock);
 	INIT_WORK(&sta->drv_unblock_wk, sta_unblock);
+	INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work);
+	mutex_init(&sta->ampdu_mlme.mtx);
 
 	memcpy(sta->sta.addr, addr, ETH_ALEN);
 	sta->local = local;
@@ -246,14 +247,12 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 	}
 
 	for (i = 0; i < STA_TID_NUM; i++) {
-		/* timer_to_tid must be initialized with identity mapping to
-		 * enable session_timer's data differentiation. refer to
-		 * sta_rx_agg_session_timer_expired for useage */
+		/*
+		 * timer_to_tid must be initialized with identity mapping
+		 * to enable session_timer's data differentiation. See
+		 * sta_rx_agg_session_timer_expired for usage.
+		 */
 		sta->timer_to_tid[i] = i;
-		/* tx */
-		sta->ampdu_mlme.tid_state_tx[i] = HT_AGG_STATE_IDLE;
-		sta->ampdu_mlme.tid_tx[i] = NULL;
-		sta->ampdu_mlme.addba_req_num[i] = 0;
 	}
 	skb_queue_head_init(&sta->ps_tx_buf);
 	skb_queue_head_init(&sta->tx_filtered);
@@ -262,8 +261,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 		sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX);
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: Allocated STA %pM\n",
-	       wiphy_name(local->hw.wiphy), sta->sta.addr);
+	wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr);
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 #ifdef CONFIG_MAC80211_MESH
@@ -282,7 +280,7 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async)
 	unsigned long flags;
 	int err = 0;
 
-	WARN_ON(!mutex_is_locked(&local->sta_mtx));
+	lockdep_assert_held(&local->sta_mtx);
 
 	/* notify driver */
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
@@ -300,8 +298,9 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async)
 		sta->uploaded = true;
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 		if (async)
-			printk(KERN_DEBUG "%s: Finished adding IBSS STA %pM\n",
-			       wiphy_name(local->hw.wiphy), sta->sta.addr);
+			wiphy_debug(local->hw.wiphy,
+				    "Finished adding IBSS STA %pM\n",
+				    sta->sta.addr);
 #endif
 	}
 
@@ -411,8 +410,8 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
 		spin_unlock_irqrestore(&local->sta_lock, flags);
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-		printk(KERN_DEBUG "%s: Added IBSS STA %pM\n",
-		       wiphy_name(local->hw.wiphy), sta->sta.addr);
+		wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n",
+			    sta->sta.addr);
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 		ieee80211_queue_work(&local->hw, &local->sta_finish_work);
@@ -459,8 +458,7 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
 	}
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: Inserted STA %pM\n",
-	       wiphy_name(local->hw.wiphy), sta->sta.addr);
+	wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr);
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 
 	/* move reference to rcu-protected */
@@ -618,7 +616,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
 	struct ieee80211_sub_if_data *sdata;
 	struct sk_buff *skb;
 	unsigned long flags;
-	int ret;
+	int ret, i;
 
 	might_sleep();
 
@@ -635,7 +633,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
 	 * will be sufficient.
 	 */
 	set_sta_flags(sta, WLAN_STA_BLOCK_BA);
-	ieee80211_sta_tear_down_BA_sessions(sta);
+	ieee80211_sta_tear_down_BA_sessions(sta, true);
 
 	spin_lock_irqsave(&local->sta_lock, flags);
 	ret = sta_info_hash_del(local, sta);
@@ -646,18 +644,10 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
 	if (ret)
 		return ret;
 
-	if (sta->key) {
-		ieee80211_key_free(sta->key);
-		/*
-		 * We have only unlinked the key, and actually destroying it
-		 * may mean it is removed from hardware which requires that
-		 * the key->sta pointer is still valid, so flush the key todo
-		 * list here.
-		 */
-		ieee80211_key_todo();
-
-		WARN_ON(sta->key);
-	}
+	for (i = 0; i < NUM_DEFAULT_KEYS; i++)
+		ieee80211_key_free(local, sta->gtk[i]);
+	if (sta->ptk)
+		ieee80211_key_free(local, sta->ptk);
 
 	sta->dead = true;
 
@@ -698,8 +688,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta)
 #endif
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
-	printk(KERN_DEBUG "%s: Removed STA %pM\n",
-	       wiphy_name(local->hw.wiphy), sta->sta.addr);
+	wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr);
 #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */
 	cancel_work_sync(&sta->drv_unblock_wk);
 
@@ -849,13 +838,20 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
 	mutex_unlock(&local->sta_mtx);
 }
 
-struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw,
-					       const u8 *addr)
+struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
+					       const u8 *addr,
+					       const u8 *localaddr)
 {
 	struct sta_info *sta, *nxt;
 
-	/* Just return a random station ... first in list ... */
+	/*
+	 * Just return a random station if localaddr is NULL
+	 * ... first in list.
+	 */
 	for_each_sta_info(hw_to_local(hw), addr, sta, nxt) {
+		if (localaddr &&
+		    compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0)
+			continue;
 		if (!sta->uploaded)
 			return NULL;
 		return &sta->sta;
@@ -863,7 +859,7 @@ struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw,
 
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_hw);
+EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr);
 
 struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif,
 					 const u8 *addr)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index df9d45544ca..9265acadef3 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -42,9 +42,6 @@
  *	be in the queues
  * @WLAN_STA_PSPOLL: Station sent PS-poll while driver was keeping
  *	station in power-save mode, reply when the driver unblocks.
- * @WLAN_STA_DISASSOC: Disassociation in progress.
- *	This is used to reject TX BA session requests when disassociation
- *	is in progress.
  */
 enum ieee80211_sta_info_flags {
 	WLAN_STA_AUTH		= 1<<0,
@@ -60,38 +57,46 @@ enum ieee80211_sta_info_flags {
 	WLAN_STA_BLOCK_BA	= 1<<11,
 	WLAN_STA_PS_DRIVER	= 1<<12,
 	WLAN_STA_PSPOLL		= 1<<13,
-	WLAN_STA_DISASSOC       = 1<<14,
 };
 
 #define STA_TID_NUM 16
 #define ADDBA_RESP_INTERVAL HZ
-#define HT_AGG_MAX_RETRIES		(0x3)
+#define HT_AGG_MAX_RETRIES		0x3
 
-#define HT_AGG_STATE_INITIATOR_SHIFT	(4)
-
-#define HT_ADDBA_REQUESTED_MSK		BIT(0)
-#define HT_ADDBA_DRV_READY_MSK		BIT(1)
-#define HT_ADDBA_RECEIVED_MSK		BIT(2)
-#define HT_AGG_STATE_REQ_STOP_BA_MSK	BIT(3)
-#define HT_AGG_STATE_INITIATOR_MSK      BIT(HT_AGG_STATE_INITIATOR_SHIFT)
-#define HT_AGG_STATE_IDLE		(0x0)
-#define HT_AGG_STATE_OPERATIONAL	(HT_ADDBA_REQUESTED_MSK |	\
-					 HT_ADDBA_DRV_READY_MSK |	\
-					 HT_ADDBA_RECEIVED_MSK)
+#define HT_AGG_STATE_DRV_READY		0
+#define HT_AGG_STATE_RESPONSE_RECEIVED	1
+#define HT_AGG_STATE_OPERATIONAL	2
+#define HT_AGG_STATE_STOPPING		3
+#define HT_AGG_STATE_WANT_START		4
+#define HT_AGG_STATE_WANT_STOP		5
 
 /**
  * struct tid_ampdu_tx - TID aggregation information (Tx).
  *
+ * @rcu_head: rcu head for freeing structure
  * @addba_resp_timer: timer for peer's response to addba request
  * @pending: pending frames queue -- use sta's spinlock to protect
- * @ssn: Starting Sequence Number expected to be aggregated.
  * @dialog_token: dialog token for aggregation session
+ * @state: session state (see above)
+ * @stop_initiator: initiator of a session stop
+ * @tx_stop: TX DelBA frame when stopping
+ *
+ * This structure is protected by RCU and the per-station
+ * spinlock. Assignments to the array holding it must hold
+ * the spinlock, only the TX path can access it under RCU
+ * lock-free if, and only if, the state has  the flag
+ * %HT_AGG_STATE_OPERATIONAL set. Otherwise, the TX path
+ * must also acquire the spinlock and re-check the state,
+ * see comments in the tx code touching it.
  */
 struct tid_ampdu_tx {
+	struct rcu_head rcu_head;
 	struct timer_list addba_resp_timer;
 	struct sk_buff_head pending;
-	u16 ssn;
+	unsigned long state;
 	u8 dialog_token;
+	u8 stop_initiator;
+	bool tx_stop;
 };
 
 /**
@@ -100,17 +105,33 @@ struct tid_ampdu_tx {
  * @reorder_buf: buffer to reorder incoming aggregated MPDUs
  * @reorder_time: jiffies when skb was added
  * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
+ * @reorder_timer: releases expired frames from the reorder buffer.
  * @head_seq_num: head sequence number in reordering buffer.
  * @stored_mpdu_num: number of MPDUs in reordering buffer
  * @ssn: Starting Sequence Number expected to be aggregated.
  * @buf_size: buffer size for incoming A-MPDUs
  * @timeout: reset timer value (in TUs).
  * @dialog_token: dialog token for aggregation session
+ * @rcu_head: RCU head used for freeing this struct
+ * @reorder_lock: serializes access to reorder buffer, see below.
+ *
+ * This structure is protected by RCU and the per-station
+ * spinlock. Assignments to the array holding it must hold
+ * the spinlock.
+ *
+ * The @reorder_lock is used to protect the variables and
+ * arrays such as @reorder_buf, @reorder_time, @head_seq_num,
+ * @stored_mpdu_num and @reorder_time from being corrupted by
+ * concurrent access of the RX path and the expired frame
+ * release timer.
  */
 struct tid_ampdu_rx {
+	struct rcu_head rcu_head;
+	spinlock_t reorder_lock;
 	struct sk_buff **reorder_buf;
 	unsigned long *reorder_time;
 	struct timer_list session_timer;
+	struct timer_list reorder_timer;
 	u16 head_seq_num;
 	u16 stored_mpdu_num;
 	u16 ssn;
@@ -120,6 +141,32 @@ struct tid_ampdu_rx {
 };
 
 /**
+ * struct sta_ampdu_mlme - STA aggregation information.
+ *
+ * @tid_rx: aggregation info for Rx per TID -- RCU protected
+ * @tid_tx: aggregation info for Tx per TID
+ * @addba_req_num: number of times addBA request has been sent.
+ * @dialog_token_allocator: dialog token enumerator for each new session;
+ * @work: work struct for starting/stopping aggregation
+ * @tid_rx_timer_expired: bitmap indicating on which TIDs the
+ *	RX timer expired until the work for it runs
+ * @mtx: mutex to protect all TX data (except non-NULL assignments
+ *	to tid_tx[idx], which are protected by the sta spinlock)
+ */
+struct sta_ampdu_mlme {
+	struct mutex mtx;
+	/* rx */
+	struct tid_ampdu_rx *tid_rx[STA_TID_NUM];
+	unsigned long tid_rx_timer_expired[BITS_TO_LONGS(STA_TID_NUM)];
+	/* tx */
+	struct work_struct work;
+	struct tid_ampdu_tx *tid_tx[STA_TID_NUM];
+	u8 addba_req_num[STA_TID_NUM];
+	u8 dialog_token_allocator;
+};
+
+
+/**
  * enum plink_state - state of a mesh peer link finite state machine
  *
  * @PLINK_LISTEN: initial state, considered the implicit state of non existant
@@ -143,28 +190,6 @@ enum plink_state {
 };
 
 /**
- * struct sta_ampdu_mlme - STA aggregation information.
- *
- * @tid_active_rx: TID's state in Rx session state machine.
- * @tid_rx: aggregation info for Rx per TID
- * @tid_state_tx: TID's state in Tx session state machine.
- * @tid_tx: aggregation info for Tx per TID
- * @addba_req_num: number of times addBA request has been sent.
- * @dialog_token_allocator: dialog token enumerator for each new session;
- */
-struct sta_ampdu_mlme {
-	/* rx */
-	bool tid_active_rx[STA_TID_NUM];
-	struct tid_ampdu_rx *tid_rx[STA_TID_NUM];
-	/* tx */
-	u8 tid_state_tx[STA_TID_NUM];
-	struct tid_ampdu_tx *tid_tx[STA_TID_NUM];
-	u8 addba_req_num[STA_TID_NUM];
-	u8 dialog_token_allocator;
-};
-
-
-/**
  * struct sta_info - STA information
  *
  * This structure collects information about a station that
@@ -174,7 +199,8 @@ struct sta_ampdu_mlme {
  * @hnext: hash table linked list pointer
  * @local: pointer to the global information
  * @sdata: virtual interface this station belongs to
- * @key: peer key negotiated with this station, if any
+ * @ptk: peer key negotiated with this station, if any
+ * @gtk: group keys negotiated with this station, if any
  * @rate_ctrl: rate control algorithm reference
  * @rate_ctrl_priv: rate control private per-STA pointer
  * @last_tx_rate: rate used for last transmit, to report to userspace as
@@ -229,7 +255,8 @@ struct sta_info {
 	struct sta_info *hnext;
 	struct ieee80211_local *local;
 	struct ieee80211_sub_if_data *sdata;
-	struct ieee80211_key *key;
+	struct ieee80211_key *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+	struct ieee80211_key *ptk;
 	struct rate_control_ref *rate_ctrl;
 	void *rate_ctrl_priv;
 	spinlock_t lock;
@@ -410,20 +437,20 @@ void for_each_sta_info_type_check(struct ieee80211_local *local,
 {
 }
 
-#define for_each_sta_info(local, _addr, sta, nxt) 			\
+#define for_each_sta_info(local, _addr, _sta, nxt) 			\
 	for (	/* initialise loop */					\
-		sta = rcu_dereference(local->sta_hash[STA_HASH(_addr)]),\
-		nxt = sta ? rcu_dereference(sta->hnext) : NULL;		\
+		_sta = rcu_dereference(local->sta_hash[STA_HASH(_addr)]),\
+		nxt = _sta ? rcu_dereference(_sta->hnext) : NULL;	\
 		/* typecheck */						\
-		for_each_sta_info_type_check(local, (_addr), sta, nxt),	\
+		for_each_sta_info_type_check(local, (_addr), _sta, nxt),\
 		/* continue condition */				\
-		sta;							\
+		_sta;							\
 		/* advance loop */					\
-		sta = nxt,						\
-		nxt = sta ? rcu_dereference(sta->hnext) : NULL		\
+		_sta = nxt,						\
+		nxt = _sta ? rcu_dereference(_sta->hnext) : NULL	\
 	     )								\
 	/* compare address and run code only if it matches */		\
-	if (memcmp(sta->sta.addr, (_addr), ETH_ALEN) == 0)
+	if (memcmp(_sta->sta.addr, (_addr), ETH_ALEN) == 0)
 
 /*
  * Get STA info by index, BROKEN!
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 94613af009f..3153c19893b 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -47,7 +47,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 	/*
 	 * This skb 'survived' a round-trip through the driver, and
 	 * hopefully the driver didn't mangle it too badly. However,
-	 * we can definitely not rely on the the control information
+	 * we can definitely not rely on the control information
 	 * being correct. Clear it so we don't get junk there, and
 	 * indicate that it needs new processing, but must not be
 	 * modified/encrypted again.
@@ -58,6 +58,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 	info->control.vif = &sta->sdata->vif;
 	info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING |
 		       IEEE80211_TX_INTFL_RETRANSMISSION;
+	info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
 
 	sta->tx_filtered_count++;
 
@@ -114,11 +115,10 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 	if (net_ratelimit())
-		printk(KERN_DEBUG "%s: dropped TX filtered frame, "
-		       "queue_len=%d PS=%d @%lu\n",
-		       wiphy_name(local->hw.wiphy),
-		       skb_queue_len(&sta->tx_filtered),
-		       !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
+		wiphy_debug(local->hw.wiphy,
+			    "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n",
+			    skb_queue_len(&sta->tx_filtered),
+			    !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies);
 #endif
 	dev_kfree_skb(skb);
 }
@@ -176,7 +176,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 
 	for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
 		/* the HW cannot have attempted that rate */
-		if (i >= hw->max_rates) {
+		if (i >= hw->max_report_rates) {
 			info->status.rates[i].idx = -1;
 			info->status.rates[i].count = 0;
 		} else if (info->status.rates[i].idx >= 0) {
@@ -296,7 +296,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 	}
 
 	if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX)
-		cfg80211_action_tx_status(
+		cfg80211_mgmt_tx_status(
 			skb->dev, (unsigned long) skb, skb->data, skb->len,
 			!!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC);
 
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 7ef491e9d66..e840c9cd46d 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -202,9 +202,9 @@ EXPORT_SYMBOL(ieee80211_get_tkip_key);
  * @payload_len is the length of payload (_not_ including IV/ICV length).
  * @ta is the transmitter addresses.
  */
-void ieee80211_tkip_encrypt_data(struct crypto_blkcipher *tfm,
-				 struct ieee80211_key *key,
-				 u8 *pos, size_t payload_len, u8 *ta)
+int ieee80211_tkip_encrypt_data(struct crypto_blkcipher *tfm,
+				struct ieee80211_key *key,
+				u8 *pos, size_t payload_len, u8 *ta)
 {
 	u8 rc4key[16];
 	struct tkip_ctx *ctx = &key->u.tkip.tx;
@@ -216,7 +216,7 @@ void ieee80211_tkip_encrypt_data(struct crypto_blkcipher *tfm,
 
 	tkip_mixing_phase2(tk, ctx, ctx->iv16, rc4key);
 
-	ieee80211_wep_encrypt_data(tfm, rc4key, 16, pos, payload_len);
+	return ieee80211_wep_encrypt_data(tfm, rc4key, 16, pos, payload_len);
 }
 
 /* Decrypt packet payload with TKIP using @key. @pos is a pointer to the
diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h
index d4714383f5f..7e83dee976f 100644
--- a/net/mac80211/tkip.h
+++ b/net/mac80211/tkip.h
@@ -15,7 +15,7 @@
 
 u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key, u16 iv16);
 
-void ieee80211_tkip_encrypt_data(struct crypto_blkcipher *tfm,
+int ieee80211_tkip_encrypt_data(struct crypto_blkcipher *tfm,
 				 struct ieee80211_key *key,
 				 u8 *pos, size_t payload_len, u8 *ta);
 enum {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 680bcb7093d..96c59430950 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -273,6 +273,9 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 		 */
 		return TX_DROP;
 
+	if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
+		return TX_CONTINUE;
+
 	if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
 		return TX_CONTINUE;
 
@@ -351,8 +354,8 @@ static void purge_old_ps_buffers(struct ieee80211_local *local)
 
 	local->total_ps_buffered = total;
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
-	printk(KERN_DEBUG "%s: PS buffers full - purged %d frames\n",
-	       wiphy_name(local->hw.wiphy), purged);
+	wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n",
+		    purged);
 #endif
 }
 
@@ -509,6 +512,18 @@ ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx)
 }
 
 static ieee80211_tx_result debug_noinline
+ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
+	if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol &&
+		     tx->sdata->control_port_no_encrypt))
+		info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
+	return TX_CONTINUE;
+}
+
+static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key = NULL;
@@ -517,7 +532,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key = NULL;
-	else if (tx->sta && (key = rcu_dereference(tx->sta->key)))
+	else if (tx->sta && (key = rcu_dereference(tx->sta->ptk)))
 		tx->key = key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
@@ -527,7 +542,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 	else if ((key = rcu_dereference(tx->sdata->default_key)))
 		tx->key = key;
 	else if (tx->sdata->drop_unencrypted &&
-		 (tx->skb->protocol != cpu_to_be16(ETH_P_PAE)) &&
+		 (tx->skb->protocol != tx->sdata->control_port_protocol) &&
 		 !(info->flags & IEEE80211_TX_CTL_INJECTED) &&
 		 (!ieee80211_is_robust_mgmt_frame(hdr) ||
 		  (ieee80211_is_action(hdr->frame_control) &&
@@ -543,15 +558,16 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 		tx->key->tx_rx_count++;
 		/* TODO: add threshold stuff again */
 
-		switch (tx->key->conf.alg) {
-		case ALG_WEP:
+		switch (tx->key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_WEP40:
+		case WLAN_CIPHER_SUITE_WEP104:
 			if (ieee80211_is_auth(hdr->frame_control))
 				break;
-		case ALG_TKIP:
+		case WLAN_CIPHER_SUITE_TKIP:
 			if (!ieee80211_is_data_present(hdr->frame_control))
 				tx->key = NULL;
 			break;
-		case ALG_CCMP:
+		case WLAN_CIPHER_SUITE_CCMP:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
 					       tx->skb))
@@ -561,7 +577,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 					   IEEE80211_KEY_FLAG_SW_MGMT) &&
 					ieee80211_is_mgmt(hdr->frame_control);
 			break;
-		case ALG_AES_CMAC:
+		case WLAN_CIPHER_SUITE_AES_CMAC:
 			if (!ieee80211_is_mgmt(hdr->frame_control))
 				tx->key = NULL;
 			break;
@@ -576,17 +592,6 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 }
 
 static ieee80211_tx_result debug_noinline
-ieee80211_tx_h_sta(struct ieee80211_tx_data *tx)
-{
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
-
-	if (tx->sta && tx->sta->uploaded)
-		info->control.sta = &tx->sta->sta;
-
-	return TX_CONTINUE;
-}
-
-static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
@@ -957,22 +962,31 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx)
 {
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
+
 	if (!tx->key)
 		return TX_CONTINUE;
 
-	switch (tx->key->conf.alg) {
-	case ALG_WEP:
+	switch (tx->key->conf.cipher) {
+	case WLAN_CIPHER_SUITE_WEP40:
+	case WLAN_CIPHER_SUITE_WEP104:
 		return ieee80211_crypto_wep_encrypt(tx);
-	case ALG_TKIP:
+	case WLAN_CIPHER_SUITE_TKIP:
 		return ieee80211_crypto_tkip_encrypt(tx);
-	case ALG_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP:
 		return ieee80211_crypto_ccmp_encrypt(tx);
-	case ALG_AES_CMAC:
+	case WLAN_CIPHER_SUITE_AES_CMAC:
 		return ieee80211_crypto_aes_cmac_encrypt(tx);
+	default:
+		/* handle hw-only algorithm */
+		if (info->control.hw_key) {
+			ieee80211_tx_set_protected(tx);
+			return TX_CONTINUE;
+		}
+		break;
+
 	}
 
-	/* not reached */
-	WARN_ON(1);
 	return TX_DROP;
 }
 
@@ -1092,6 +1106,59 @@ static bool __ieee80211_parse_tx_radiotap(struct ieee80211_tx_data *tx,
 	return true;
 }
 
+static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
+				  struct sk_buff *skb,
+				  struct ieee80211_tx_info *info,
+				  struct tid_ampdu_tx *tid_tx,
+				  int tid)
+{
+	bool queued = false;
+
+	if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+		info->flags |= IEEE80211_TX_CTL_AMPDU;
+	} else if (test_bit(HT_AGG_STATE_WANT_START, &tid_tx->state)) {
+		/*
+		 * nothing -- this aggregation session is being started
+		 * but that might still fail with the driver
+		 */
+	} else {
+		spin_lock(&tx->sta->lock);
+		/*
+		 * Need to re-check now, because we may get here
+		 *
+		 *  1) in the window during which the setup is actually
+		 *     already done, but not marked yet because not all
+		 *     packets are spliced over to the driver pending
+		 *     queue yet -- if this happened we acquire the lock
+		 *     either before or after the splice happens, but
+		 *     need to recheck which of these cases happened.
+		 *
+		 *  2) during session teardown, if the OPERATIONAL bit
+		 *     was cleared due to the teardown but the pointer
+		 *     hasn't been assigned NULL yet (or we loaded it
+		 *     before it was assigned) -- in this case it may
+		 *     now be NULL which means we should just let the
+		 *     packet pass through because splicing the frames
+		 *     back is already done.
+		 */
+		tid_tx = tx->sta->ampdu_mlme.tid_tx[tid];
+
+		if (!tid_tx) {
+			/* do nothing, let packet pass through */
+		} else if (test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) {
+			info->flags |= IEEE80211_TX_CTL_AMPDU;
+		} else {
+			queued = true;
+			info->control.vif = &tx->sdata->vif;
+			info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
+			__skb_queue_tail(&tid_tx->pending, skb);
+		}
+		spin_unlock(&tx->sta->lock);
+	}
+
+	return queued;
+}
+
 /*
  * initialises @tx
  */
@@ -1104,8 +1171,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_hdr *hdr;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int hdrlen, tid;
-	u8 *qc, *state;
-	bool queued = false;
+	u8 *qc;
 
 	memset(tx, 0, sizeof(*tx));
 	tx->skb = skb;
@@ -1157,35 +1223,16 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
 		qc = ieee80211_get_qos_ctl(hdr);
 		tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
 
-		spin_lock(&tx->sta->lock);
-		/*
-		 * XXX: This spinlock could be fairly expensive, but see the
-		 *	comment in agg-tx.c:ieee80211_agg_tx_operational().
-		 *	One way to solve this would be to do something RCU-like
-		 *	for managing the tid_tx struct and using atomic bitops
-		 *	for the actual state -- by introducing an actual
-		 *	'operational' bit that would be possible. It would
-		 *	require changing ieee80211_agg_tx_operational() to
-		 *	set that bit, and changing the way tid_tx is managed
-		 *	everywhere, including races between that bit and
-		 *	tid_tx going away (tid_tx being added can be easily
-		 *	committed to memory before the 'operational' bit).
-		 */
-		tid_tx = tx->sta->ampdu_mlme.tid_tx[tid];
-		state = &tx->sta->ampdu_mlme.tid_state_tx[tid];
-		if (*state == HT_AGG_STATE_OPERATIONAL) {
-			info->flags |= IEEE80211_TX_CTL_AMPDU;
-		} else if (*state != HT_AGG_STATE_IDLE) {
-			/* in progress */
-			queued = true;
-			info->control.vif = &sdata->vif;
-			info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
-			__skb_queue_tail(&tid_tx->pending, skb);
-		}
-		spin_unlock(&tx->sta->lock);
+		tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]);
+		if (tid_tx) {
+			bool queued;
+
+			queued = ieee80211_tx_prep_agg(tx, skb, info,
+						       tid_tx, tid);
 
-		if (unlikely(queued))
-			return TX_QUEUED;
+			if (unlikely(queued))
+				return TX_QUEUED;
+		}
 	}
 
 	if (is_multicast_ether_addr(hdr->addr1)) {
@@ -1274,6 +1321,11 @@ static int __ieee80211_tx(struct ieee80211_local *local,
 			break;
 		}
 
+		if (sta && sta->uploaded)
+			info->control.sta = &sta->sta;
+		else
+			info->control.sta = NULL;
+
 		ret = drv_tx(local, skb);
 		if (WARN_ON(ret != NETDEV_TX_OK && skb->len != len)) {
 			dev_kfree_skb(skb);
@@ -1312,8 +1364,8 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	CALL_TXH(ieee80211_tx_h_dynamic_ps);
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
+	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
 	CALL_TXH(ieee80211_tx_h_select_key);
-	CALL_TXH(ieee80211_tx_h_sta);
 	if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
 
@@ -1485,8 +1537,8 @@ static int ieee80211_skb_resize(struct ieee80211_local *local,
 		I802_DEBUG_INC(local->tx_expand_skb_head);
 
 	if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) {
-		printk(KERN_DEBUG "%s: failed to reallocate TX buffer\n",
-		       wiphy_name(local->hw.wiphy));
+		wiphy_debug(local->hw.wiphy,
+			    "failed to reallocate TX buffer\n");
 		return -ENOMEM;
 	}
 
@@ -1560,6 +1612,7 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 
+	hdr = (struct ieee80211_hdr *) skb->data;
 	info->control.vif = &sdata->vif;
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) &&
@@ -1673,7 +1726,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 	u16 ethertype, hdrlen,  meshhdrlen = 0;
 	__le16 fc;
 	struct ieee80211_hdr hdr;
-	struct ieee80211s_hdr mesh_hdr;
+	struct ieee80211s_hdr mesh_hdr __maybe_unused;
 	const u8 *encaps_data;
 	int encaps_len, skip_header_bytes;
 	int nh_pos, h_pos;
@@ -1790,7 +1843,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 #endif
 	case NL80211_IFTYPE_STATION:
 		memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
-		if (sdata->u.mgd.use_4addr && ethertype != ETH_P_PAE) {
+		if (sdata->u.mgd.use_4addr &&
+		    cpu_to_be16(ethertype) != sdata->control_port_protocol) {
 			fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS);
 			/* RA TA DA SA */
 			memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN);
@@ -1843,7 +1897,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 	if (!ieee80211_vif_is_mesh(&sdata->vif) &&
 		unlikely(!is_multicast_ether_addr(hdr.addr1) &&
 		      !(sta_flags & WLAN_STA_AUTHORIZED) &&
-		      !(ethertype == ETH_P_PAE &&
+		      !(cpu_to_be16(ethertype) == sdata->control_port_protocol &&
 		       compare_ether_addr(sdata->vif.addr,
 					  skb->data + ETH_ALEN) == 0))) {
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
@@ -1909,11 +1963,13 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 		h_pos += encaps_len;
 	}
 
+#ifdef CONFIG_MAC80211_MESH
 	if (meshhdrlen > 0) {
 		memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen);
 		nh_pos += meshhdrlen;
 		h_pos += meshhdrlen;
 	}
+#endif
 
 	if (ieee80211_is_data_qos(fc)) {
 		__le16 *qos_control;
@@ -2040,8 +2096,7 @@ void ieee80211_tx_pending(unsigned long data)
 
 		if (skb_queue_empty(&local->pending[i]))
 			list_for_each_entry_rcu(sdata, &local->interfaces, list)
-				netif_tx_wake_queue(
-					netdev_get_tx_queue(sdata->dev, i));
+				netif_wake_subqueue(sdata->dev, i);
 	}
 	spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 5b79d552780..0b6fc92bc0d 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -283,8 +283,11 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
 
 	if (skb_queue_empty(&local->pending[queue])) {
 		rcu_read_lock();
-		list_for_each_entry_rcu(sdata, &local->interfaces, list)
-			netif_tx_wake_queue(netdev_get_tx_queue(sdata->dev, queue));
+		list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+			if (test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
+				continue;
+			netif_wake_subqueue(sdata->dev, queue);
+		}
 		rcu_read_unlock();
 	} else
 		tasklet_schedule(&local->tx_pending_tasklet);
@@ -323,7 +326,7 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(sdata, &local->interfaces, list)
-		netif_tx_stop_queue(netdev_get_tx_queue(sdata->dev, queue));
+		netif_stop_subqueue(sdata->dev, queue);
 	rcu_read_unlock();
 }
 
@@ -471,16 +474,10 @@ void ieee80211_iterate_active_interfaces(
 
 	list_for_each_entry(sdata, &local->interfaces, list) {
 		switch (sdata->vif.type) {
-		case __NL80211_IFTYPE_AFTER_LAST:
-		case NL80211_IFTYPE_UNSPECIFIED:
 		case NL80211_IFTYPE_MONITOR:
 		case NL80211_IFTYPE_AP_VLAN:
 			continue;
-		case NL80211_IFTYPE_AP:
-		case NL80211_IFTYPE_STATION:
-		case NL80211_IFTYPE_ADHOC:
-		case NL80211_IFTYPE_WDS:
-		case NL80211_IFTYPE_MESH_POINT:
+		default:
 			break;
 		}
 		if (ieee80211_sdata_running(sdata))
@@ -505,16 +502,10 @@ void ieee80211_iterate_active_interfaces_atomic(
 
 	list_for_each_entry_rcu(sdata, &local->interfaces, list) {
 		switch (sdata->vif.type) {
-		case __NL80211_IFTYPE_AFTER_LAST:
-		case NL80211_IFTYPE_UNSPECIFIED:
 		case NL80211_IFTYPE_MONITOR:
 		case NL80211_IFTYPE_AP_VLAN:
 			continue;
-		case NL80211_IFTYPE_AP:
-		case NL80211_IFTYPE_STATION:
-		case NL80211_IFTYPE_ADHOC:
-		case NL80211_IFTYPE_WDS:
-		case NL80211_IFTYPE_MESH_POINT:
+		default:
 			break;
 		}
 		if (ieee80211_sdata_running(sdata))
@@ -803,8 +794,12 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
 
 	/* after reinitialize QoS TX queues setting to default,
 	 * disable QoS at all */
-	local->hw.conf.flags &=	~IEEE80211_CONF_QOS;
-	drv_config(local, IEEE80211_CONF_CHANGE_QOS);
+
+	if (sdata->vif.type != NL80211_IFTYPE_MONITOR) {
+		sdata->vif.bss_conf.qos =
+			sdata->vif.type != NL80211_IFTYPE_STATION;
+		ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+	}
 }
 
 void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
@@ -900,26 +895,34 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 
 int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 			     const u8 *ie, size_t ie_len,
-			     enum ieee80211_band band)
+			     enum ieee80211_band band, u32 rate_mask,
+			     u8 channel)
 {
 	struct ieee80211_supported_band *sband;
 	u8 *pos;
 	size_t offset = 0, noffset;
 	int supp_rates_len, i;
+	u8 rates[32];
+	int num_rates;
+	int ext_rates_len;
 
 	sband = local->hw.wiphy->bands[band];
 
 	pos = buffer;
 
-	supp_rates_len = min_t(int, sband->n_bitrates, 8);
+	num_rates = 0;
+	for (i = 0; i < sband->n_bitrates; i++) {
+		if ((BIT(i) & rate_mask) == 0)
+			continue; /* skip rate */
+		rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5);
+	}
+
+	supp_rates_len = min_t(int, num_rates, 8);
 
 	*pos++ = WLAN_EID_SUPP_RATES;
 	*pos++ = supp_rates_len;
-
-	for (i = 0; i < supp_rates_len; i++) {
-		int rate = sband->bitrates[i].bitrate;
-		*pos++ = (u8) (rate / 5);
-	}
+	memcpy(pos, rates, supp_rates_len);
+	pos += supp_rates_len;
 
 	/* insert "request information" if in custom IEs */
 	if (ie && ie_len) {
@@ -937,14 +940,18 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer,
 		offset = noffset;
 	}
 
-	if (sband->n_bitrates > i) {
+	ext_rates_len = num_rates - supp_rates_len;
+	if (ext_rates_len > 0) {
 		*pos++ = WLAN_EID_EXT_SUPP_RATES;
-		*pos++ = sband->n_bitrates - i;
+		*pos++ = ext_rates_len;
+		memcpy(pos, rates + supp_rates_len, ext_rates_len);
+		pos += ext_rates_len;
+	}
 
-		for (; i < sband->n_bitrates; i++) {
-			int rate = sband->bitrates[i].bitrate;
-			*pos++ = (u8) (rate / 5);
-		}
+	if (channel && sband->band == IEEE80211_BAND_2GHZ) {
+		*pos++ = WLAN_EID_DS_PARAMS;
+		*pos++ = 1;
+		*pos++ = channel;
 	}
 
 	/* insert custom IEs that go before HT */
@@ -1013,6 +1020,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 	struct ieee80211_mgmt *mgmt;
 	size_t buf_len;
 	u8 *buf;
+	u8 chan;
 
 	/* FIXME: come up with a proper value */
 	buf = kmalloc(200 + ie_len, GFP_KERNEL);
@@ -1022,8 +1030,14 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
 		return;
 	}
 
+	chan = ieee80211_frequency_to_channel(
+		local->hw.conf.channel->center_freq);
+
 	buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len,
-					   local->hw.conf.channel->band);
+					   local->hw.conf.channel->band,
+					   sdata->rc_rateidx_mask
+					   [local->hw.conf.channel->band],
+					   chan);
 
 	skb = ieee80211_probereq_get(&local->hw, &sdata->vif,
 				     ssid, ssid_len,
@@ -1138,18 +1152,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 	}
 	mutex_unlock(&local->sta_mtx);
 
-	/* Clear Suspend state so that ADDBA requests can be processed */
-
-	rcu_read_lock();
-
-	if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
-		list_for_each_entry_rcu(sta, &local->sta_list, list) {
-			clear_sta_flags(sta, WLAN_STA_BLOCK_BA);
-		}
-	}
-
-	rcu_read_unlock();
-
 	/* setup RTS threshold */
 	drv_set_rts_threshold(local, hw->wiphy->rts_threshold);
 
@@ -1173,7 +1175,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			  BSS_CHANGED_BASIC_RATES |
 			  BSS_CHANGED_BEACON_INT |
 			  BSS_CHANGED_BSSID |
-			  BSS_CHANGED_CQM;
+			  BSS_CHANGED_CQM |
+			  BSS_CHANGED_QOS;
 
 		switch (sdata->vif.type) {
 		case NL80211_IFTYPE_STATION:
@@ -1196,19 +1199,34 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			/* ignore virtual */
 			break;
 		case NL80211_IFTYPE_UNSPECIFIED:
-		case __NL80211_IFTYPE_AFTER_LAST:
+		case NUM_NL80211_IFTYPES:
+		case NL80211_IFTYPE_P2P_CLIENT:
+		case NL80211_IFTYPE_P2P_GO:
 			WARN_ON(1);
 			break;
 		}
 	}
 
-	rcu_read_lock();
+	/*
+	 * Clear the WLAN_STA_BLOCK_BA flag so new aggregation
+	 * sessions can be established after a resume.
+	 *
+	 * Also tear down aggregation sessions since reconfiguring
+	 * them in a hardware restart scenario is not easily done
+	 * right now, and the hardware will have lost information
+	 * about the sessions, but we and the AP still think they
+	 * are active. This is really a workaround though.
+	 */
 	if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) {
-		list_for_each_entry_rcu(sta, &local->sta_list, list) {
-			ieee80211_sta_tear_down_BA_sessions(sta);
+		mutex_lock(&local->sta_mtx);
+
+		list_for_each_entry(sta, &local->sta_list, list) {
+			ieee80211_sta_tear_down_BA_sessions(sta, true);
+			clear_sta_flags(sta, WLAN_STA_BLOCK_BA);
 		}
+
+		mutex_unlock(&local->sta_mtx);
 	}
-	rcu_read_unlock();
 
 	/* add back keys */
 	list_for_each_entry(sdata, &local->interfaces, list)
@@ -1279,17 +1297,13 @@ static int check_mgd_smps(struct ieee80211_if_managed *ifmgd,
 }
 
 /* must hold iflist_mtx */
-void ieee80211_recalc_smps(struct ieee80211_local *local,
-			   struct ieee80211_sub_if_data *forsdata)
+void ieee80211_recalc_smps(struct ieee80211_local *local)
 {
 	struct ieee80211_sub_if_data *sdata;
 	enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF;
 	int count = 0;
 
-	if (forsdata)
-		WARN_ON(!mutex_is_locked(&forsdata->u.mgd.mtx));
-
-	WARN_ON(!mutex_is_locked(&local->iflist_mtx));
+	lockdep_assert_held(&local->iflist_mtx);
 
 	/*
 	 * This function could be improved to handle multiple
@@ -1302,22 +1316,12 @@ void ieee80211_recalc_smps(struct ieee80211_local *local,
 	 */
 
 	list_for_each_entry(sdata, &local->interfaces, list) {
-		if (!netif_running(sdata->dev))
+		if (!ieee80211_sdata_running(sdata))
 			continue;
 		if (sdata->vif.type != NL80211_IFTYPE_STATION)
 			goto set;
-		if (sdata != forsdata) {
-			/*
-			 * This nested is ok -- we are holding the iflist_mtx
-			 * so can't get here twice or so. But it's required
-			 * since normally we acquire it first and then the
-			 * iflist_mtx.
-			 */
-			mutex_lock_nested(&sdata->u.mgd.mtx, SINGLE_DEPTH_NESTING);
-			count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
-			mutex_unlock(&sdata->u.mgd.mtx);
-		} else
-			count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
+
+		count += check_mgd_smps(&sdata->u.mgd, &smps_mode);
 
 		if (count > 1) {
 			smps_mode = IEEE80211_SMPS_OFF;
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 5f3a4113bda..2ff6d1e3ed2 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -32,13 +32,16 @@ int ieee80211_wep_init(struct ieee80211_local *local)
 
 	local->wep_tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0,
 						CRYPTO_ALG_ASYNC);
-	if (IS_ERR(local->wep_tx_tfm))
+	if (IS_ERR(local->wep_tx_tfm)) {
+		local->wep_rx_tfm = ERR_PTR(-EINVAL);
 		return PTR_ERR(local->wep_tx_tfm);
+	}
 
 	local->wep_rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0,
 						CRYPTO_ALG_ASYNC);
 	if (IS_ERR(local->wep_rx_tfm)) {
 		crypto_free_blkcipher(local->wep_tx_tfm);
+		local->wep_tx_tfm = ERR_PTR(-EINVAL);
 		return PTR_ERR(local->wep_rx_tfm);
 	}
 
@@ -47,8 +50,10 @@ int ieee80211_wep_init(struct ieee80211_local *local)
 
 void ieee80211_wep_free(struct ieee80211_local *local)
 {
-	crypto_free_blkcipher(local->wep_tx_tfm);
-	crypto_free_blkcipher(local->wep_rx_tfm);
+	if (!IS_ERR(local->wep_tx_tfm))
+		crypto_free_blkcipher(local->wep_tx_tfm);
+	if (!IS_ERR(local->wep_rx_tfm))
+		crypto_free_blkcipher(local->wep_rx_tfm);
 }
 
 static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen)
@@ -122,19 +127,24 @@ static void ieee80211_wep_remove_iv(struct ieee80211_local *local,
 /* Perform WEP encryption using given key. data buffer must have tailroom
  * for 4-byte ICV. data_len must not include this ICV. Note: this function
  * does _not_ add IV. data = RC4(data | CRC32(data)) */
-void ieee80211_wep_encrypt_data(struct crypto_blkcipher *tfm, u8 *rc4key,
-				size_t klen, u8 *data, size_t data_len)
+int ieee80211_wep_encrypt_data(struct crypto_blkcipher *tfm, u8 *rc4key,
+			       size_t klen, u8 *data, size_t data_len)
 {
 	struct blkcipher_desc desc = { .tfm = tfm };
 	struct scatterlist sg;
 	__le32 icv;
 
+	if (IS_ERR(tfm))
+		return -1;
+
 	icv = cpu_to_le32(~crc32_le(~0, data, data_len));
 	put_unaligned(icv, (__le32 *)(data + data_len));
 
 	crypto_blkcipher_setkey(tfm, rc4key, klen);
 	sg_init_one(&sg, data, data_len + WEP_ICV_LEN);
 	crypto_blkcipher_encrypt(&desc, &sg, &sg, sg.length);
+
+	return 0;
 }
 
 
@@ -168,10 +178,8 @@ int ieee80211_wep_encrypt(struct ieee80211_local *local,
 	/* Add room for ICV */
 	skb_put(skb, WEP_ICV_LEN);
 
-	ieee80211_wep_encrypt_data(local->wep_tx_tfm, rc4key, keylen + 3,
-				   iv + WEP_IV_LEN, len);
-
-	return 0;
+	return ieee80211_wep_encrypt_data(local->wep_tx_tfm, rc4key, keylen + 3,
+					  iv + WEP_IV_LEN, len);
 }
 
 
@@ -185,6 +193,9 @@ int ieee80211_wep_decrypt_data(struct crypto_blkcipher *tfm, u8 *rc4key,
 	struct scatterlist sg;
 	__le32 crc;
 
+	if (IS_ERR(tfm))
+		return -1;
+
 	crypto_blkcipher_setkey(tfm, rc4key, klen);
 	sg_init_one(&sg, data, data_len + WEP_ICV_LEN);
 	crypto_blkcipher_decrypt(&desc, &sg, &sg, sg.length);
@@ -211,7 +222,7 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
 				 struct ieee80211_key *key)
 {
 	u32 klen;
-	u8 *rc4key;
+	u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
 	u8 keyidx;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	unsigned int hdrlen;
@@ -229,15 +240,11 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
 
 	keyidx = skb->data[hdrlen + 3] >> 6;
 
-	if (!key || keyidx != key->conf.keyidx || key->conf.alg != ALG_WEP)
+	if (!key || keyidx != key->conf.keyidx)
 		return -1;
 
 	klen = 3 + key->conf.keylen;
 
-	rc4key = kmalloc(klen, GFP_ATOMIC);
-	if (!rc4key)
-		return -1;
-
 	/* Prepend 24-bit IV to RC4 key */
 	memcpy(rc4key, skb->data + hdrlen, 3);
 
@@ -249,8 +256,6 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local,
 				       len))
 		ret = -1;
 
-	kfree(rc4key);
-
 	/* Trim ICV */
 	skb_trim(skb, skb->len - WEP_ICV_LEN);
 
diff --git a/net/mac80211/wep.h b/net/mac80211/wep.h
index fe29d7e5759..58654ee3351 100644
--- a/net/mac80211/wep.h
+++ b/net/mac80211/wep.h
@@ -18,7 +18,7 @@
 
 int ieee80211_wep_init(struct ieee80211_local *local);
 void ieee80211_wep_free(struct ieee80211_local *local);
-void ieee80211_wep_encrypt_data(struct crypto_blkcipher *tfm, u8 *rc4key,
+int ieee80211_wep_encrypt_data(struct crypto_blkcipher *tfm, u8 *rc4key,
 				size_t klen, u8 *data, size_t data_len);
 int ieee80211_wep_encrypt(struct ieee80211_local *local,
 			  struct sk_buff *skb,
diff --git a/net/mac80211/work.c b/net/mac80211/work.c
index b025dc7bb0f..ae344d1ba05 100644
--- a/net/mac80211/work.c
+++ b/net/mac80211/work.c
@@ -43,7 +43,7 @@ enum work_action {
 /* utils */
 static inline void ASSERT_WORK_MTX(struct ieee80211_local *local)
 {
-	WARN_ON(!mutex_is_locked(&local->work_mtx));
+	lockdep_assert_held(&local->mtx);
 }
 
 /*
@@ -560,6 +560,22 @@ ieee80211_remain_on_channel_timeout(struct ieee80211_work *wk)
 	return WORK_ACT_TIMEOUT;
 }
 
+static enum work_action __must_check
+ieee80211_assoc_beacon_wait(struct ieee80211_work *wk)
+{
+	if (wk->started)
+		return WORK_ACT_TIMEOUT;
+
+	/*
+	 * Wait up to one beacon interval ...
+	 * should this be more if we miss one?
+	 */
+	printk(KERN_DEBUG "%s: waiting for beacon from %pM\n",
+	       wk->sdata->name, wk->filter_ta);
+	wk->timeout = TU_TO_EXP_TIME(wk->assoc.bss->beacon_interval);
+	return WORK_ACT_NONE;
+}
+
 static void ieee80211_auth_challenge(struct ieee80211_work *wk,
 				     struct ieee80211_mgmt *mgmt,
 				     size_t len)
@@ -709,6 +725,25 @@ ieee80211_rx_mgmt_probe_resp(struct ieee80211_work *wk,
 	return WORK_ACT_DONE;
 }
 
+static enum work_action __must_check
+ieee80211_rx_mgmt_beacon(struct ieee80211_work *wk,
+			 struct ieee80211_mgmt *mgmt, size_t len)
+{
+	struct ieee80211_sub_if_data *sdata = wk->sdata;
+	struct ieee80211_local *local = sdata->local;
+
+	ASSERT_WORK_MTX(local);
+
+	if (wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT)
+		return WORK_ACT_MISMATCH;
+
+	if (len < 24 + 12)
+		return WORK_ACT_NONE;
+
+	printk(KERN_DEBUG "%s: beacon received\n", sdata->name);
+	return WORK_ACT_DONE;
+}
+
 static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
 					  struct sk_buff *skb)
 {
@@ -722,7 +757,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
 	mgmt = (struct ieee80211_mgmt *) skb->data;
 	fc = le16_to_cpu(mgmt->frame_control);
 
-	mutex_lock(&local->work_mtx);
+	mutex_lock(&local->mtx);
 
 	list_for_each_entry(wk, &local->work_list, list) {
 		const u8 *bssid = NULL;
@@ -731,6 +766,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
 		case IEEE80211_WORK_DIRECT_PROBE:
 		case IEEE80211_WORK_AUTH:
 		case IEEE80211_WORK_ASSOC:
+		case IEEE80211_WORK_ASSOC_BEACON_WAIT:
 			bssid = wk->filter_ta;
 			break;
 		default:
@@ -745,6 +781,9 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
 			continue;
 
 		switch (fc & IEEE80211_FCTL_STYPE) {
+		case IEEE80211_STYPE_BEACON:
+			rma = ieee80211_rx_mgmt_beacon(wk, mgmt, skb->len);
+			break;
 		case IEEE80211_STYPE_PROBE_RESP:
 			rma = ieee80211_rx_mgmt_probe_resp(wk, mgmt, skb->len,
 							   rx_status);
@@ -794,7 +833,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
 		WARN(1, "unexpected: %d", rma);
 	}
 
-	mutex_unlock(&local->work_mtx);
+	mutex_unlock(&local->mtx);
 
 	if (rma != WORK_ACT_DONE)
 		goto out;
@@ -806,9 +845,9 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local,
 	case WORK_DONE_REQUEUE:
 		synchronize_rcu();
 		wk->started = false; /* restart */
-		mutex_lock(&local->work_mtx);
+		mutex_lock(&local->mtx);
 		list_add_tail(&wk->list, &local->work_list);
-		mutex_unlock(&local->work_mtx);
+		mutex_unlock(&local->mtx);
 	}
 
  out:
@@ -840,7 +879,7 @@ static void ieee80211_work_work(struct work_struct *work)
 
 	/*
 	 * ieee80211_queue_work() should have picked up most cases,
-	 * here we'll pick the the rest.
+	 * here we'll pick the rest.
 	 */
 	if (WARN(local->suspended, "work scheduled while going to suspend\n"))
 		return;
@@ -849,9 +888,9 @@ static void ieee80211_work_work(struct work_struct *work)
 	while ((skb = skb_dequeue(&local->work_skb_queue)))
 		ieee80211_work_rx_queued_mgmt(local, skb);
 
-	ieee80211_recalc_idle(local);
+	mutex_lock(&local->mtx);
 
-	mutex_lock(&local->work_mtx);
+	ieee80211_recalc_idle(local);
 
 	list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
 		bool started = wk->started;
@@ -916,6 +955,9 @@ static void ieee80211_work_work(struct work_struct *work)
 		case IEEE80211_WORK_REMAIN_ON_CHANNEL:
 			rma = ieee80211_remain_on_channel_timeout(wk);
 			break;
+		case IEEE80211_WORK_ASSOC_BEACON_WAIT:
+			rma = ieee80211_assoc_beacon_wait(wk);
+			break;
 		}
 
 		wk->started = started;
@@ -953,20 +995,16 @@ static void ieee80211_work_work(struct work_struct *work)
 		run_again(local, jiffies + HZ/2);
 	}
 
-	mutex_lock(&local->scan_mtx);
-
 	if (list_empty(&local->work_list) && local->scan_req &&
 	    !local->scanning)
 		ieee80211_queue_delayed_work(&local->hw,
 					     &local->scan_work,
 					     round_jiffies_relative(0));
 
-	mutex_unlock(&local->scan_mtx);
-
-	mutex_unlock(&local->work_mtx);
-
 	ieee80211_recalc_idle(local);
 
+	mutex_unlock(&local->mtx);
+
 	list_for_each_entry_safe(wk, tmp, &free_work, list) {
 		wk->done(wk, NULL);
 		list_del(&wk->list);
@@ -993,16 +1031,15 @@ void ieee80211_add_work(struct ieee80211_work *wk)
 	wk->started = false;
 
 	local = wk->sdata->local;
-	mutex_lock(&local->work_mtx);
+	mutex_lock(&local->mtx);
 	list_add_tail(&wk->list, &local->work_list);
-	mutex_unlock(&local->work_mtx);
+	mutex_unlock(&local->mtx);
 
 	ieee80211_queue_work(&local->hw, &local->work_work);
 }
 
 void ieee80211_work_init(struct ieee80211_local *local)
 {
-	mutex_init(&local->work_mtx);
 	INIT_LIST_HEAD(&local->work_list);
 	setup_timer(&local->work_timer, ieee80211_work_timer,
 		    (unsigned long)local);
@@ -1015,7 +1052,7 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_work *wk;
 
-	mutex_lock(&local->work_mtx);
+	mutex_lock(&local->mtx);
 	list_for_each_entry(wk, &local->work_list, list) {
 		if (wk->sdata != sdata)
 			continue;
@@ -1023,19 +1060,19 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata)
 		wk->started = true;
 		wk->timeout = jiffies;
 	}
-	mutex_unlock(&local->work_mtx);
+	mutex_unlock(&local->mtx);
 
 	/* run cleanups etc. */
 	ieee80211_work_work(&local->work_work);
 
-	mutex_lock(&local->work_mtx);
+	mutex_lock(&local->mtx);
 	list_for_each_entry(wk, &local->work_list, list) {
 		if (wk->sdata != sdata)
 			continue;
 		WARN_ON(1);
 		break;
 	}
-	mutex_unlock(&local->work_mtx);
+	mutex_unlock(&local->mtx);
 }
 
 ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata,
@@ -1065,6 +1102,7 @@ ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata,
 		case IEEE80211_STYPE_PROBE_RESP:
 		case IEEE80211_STYPE_ASSOC_RESP:
 		case IEEE80211_STYPE_REASSOC_RESP:
+		case IEEE80211_STYPE_BEACON:
 			skb_queue_tail(&local->work_skb_queue, skb);
 			ieee80211_queue_work(&local->hw, &local->work_work);
 			return RX_QUEUED;
@@ -1120,7 +1158,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_work *wk, *tmp;
 	bool found = false;
 
-	mutex_lock(&local->work_mtx);
+	mutex_lock(&local->mtx);
 	list_for_each_entry_safe(wk, tmp, &local->work_list, list) {
 		if ((unsigned long) wk == cookie) {
 			wk->timeout = jiffies;
@@ -1128,7 +1166,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata,
 			break;
 		}
 	}
-	mutex_unlock(&local->work_mtx);
+	mutex_unlock(&local->mtx);
 
 	if (!found)
 		return -ENOENT;
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 0adbcc941ac..bee230d8fd1 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -36,8 +36,8 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
 	int tail;
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	if (!tx->key || tx->key->conf.alg != ALG_TKIP || skb->len < 24 ||
-	    !ieee80211_is_data_present(hdr->frame_control))
+	if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
+	    skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control))
 		return TX_CONTINUE;
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -94,7 +94,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
 	if (status->flag & RX_FLAG_MMIC_STRIPPED)
 		return RX_CONTINUE;
 
-	if (!rx->key || rx->key->conf.alg != ALG_TKIP ||
+	if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP ||
 	    !ieee80211_has_protected(hdr->frame_control) ||
 	    !ieee80211_is_data_present(hdr->frame_control))
 		return RX_CONTINUE;
@@ -117,7 +117,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
 	key = &rx->key->conf.key[key_offset];
 	michael_mic(key, hdr, data, data_len, mic);
 	if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0 || wpa_test) {
-		if (!(rx->flags & IEEE80211_RX_RA_MATCH))
+		if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
 			return RX_DROP_UNUSABLE;
 
 		mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx,
@@ -183,9 +183,8 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
 	skb_put(skb, TKIP_ICV_LEN);
 
 	hdr = (struct ieee80211_hdr *) skb->data;
-	ieee80211_tkip_encrypt_data(tx->local->wep_tx_tfm,
-				    key, pos, len, hdr->addr2);
-	return 0;
+	return ieee80211_tkip_encrypt_data(tx->local->wep_tx_tfm,
+					   key, pos, len, hdr->addr2);
 }
 
 
@@ -222,19 +221,13 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx)
 	if (!rx->sta || skb->len - hdrlen < 12)
 		return RX_DROP_UNUSABLE;
 
-	if (status->flag & RX_FLAG_DECRYPTED) {
-		if (status->flag & RX_FLAG_IV_STRIPPED) {
-			/*
-			 * Hardware took care of all processing, including
-			 * replay protection, and stripped the ICV/IV so
-			 * we cannot do any checks here.
-			 */
-			return RX_CONTINUE;
-		}
-
-		/* let TKIP code verify IV, but skip decryption */
+	/*
+	 * Let TKIP code verify IV, but skip decryption.
+	 * In the case where hardware checks the IV as well,
+	 * we don't even get here, see ieee80211_rx_h_decrypt()
+	 */
+	if (status->flag & RX_FLAG_DECRYPTED)
 		hwaccel = 1;
-	}
 
 	res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm,
 					  key, skb->data + hdrlen,
@@ -436,6 +429,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx)
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	u8 pn[CCMP_PN_LEN];
 	int data_len;
+	int queue;
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
 
@@ -447,13 +441,12 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx)
 	if (!rx->sta || data_len < 0)
 		return RX_DROP_UNUSABLE;
 
-	if ((status->flag & RX_FLAG_DECRYPTED) &&
-	    (status->flag & RX_FLAG_IV_STRIPPED))
-		return RX_CONTINUE;
-
 	ccmp_hdr2pn(pn, skb->data + hdrlen);
 
-	if (memcmp(pn, key->u.ccmp.rx_pn[rx->queue], CCMP_PN_LEN) <= 0) {
+	queue = ieee80211_is_mgmt(hdr->frame_control) ?
+		NUM_RX_DATA_QUEUES : rx->queue;
+
+	if (memcmp(pn, key->u.ccmp.rx_pn[queue], CCMP_PN_LEN) <= 0) {
 		key->u.ccmp.replays++;
 		return RX_DROP_UNUSABLE;
 	}
@@ -470,7 +463,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx)
 			return RX_DROP_UNUSABLE;
 	}
 
-	memcpy(key->u.ccmp.rx_pn[rx->queue], pn, CCMP_PN_LEN);
+	memcpy(key->u.ccmp.rx_pn[queue], pn, CCMP_PN_LEN);
 
 	/* Remove CCMP header and MIC */
 	skb_trim(skb, skb->len - CCMP_MIC_LEN);
@@ -561,10 +554,6 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
 	if (!ieee80211_is_mgmt(hdr->frame_control))
 		return RX_CONTINUE;
 
-	if ((status->flag & RX_FLAG_DECRYPTED) &&
-	    (status->flag & RX_FLAG_IV_STRIPPED))
-		return RX_CONTINUE;
-
 	if (skb->len < 24 + sizeof(*mmie))
 		return RX_DROP_UNUSABLE;
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8593a77cfea..1534f2b44ca 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -40,27 +40,6 @@ config NF_CONNTRACK
 
 if NF_CONNTRACK
 
-config NF_CT_ACCT
-	bool "Connection tracking flow accounting"
-	depends on NETFILTER_ADVANCED
-	help
-	  If this option is enabled, the connection tracking code will
-	  keep per-flow packet and byte counters.
-
-	  Those counters can be used for flow-based accounting or the
-	  `connbytes' match.
-
-	  Please note that currently this option only sets a default state.
-	  You may change it at boot time with nf_conntrack.acct=0/1 kernel
-	  parameter or by loading the nf_conntrack module with acct=0/1.
-
-	  You may also disable/enable it on a running system with:
-	   sysctl net.netfilter.nf_conntrack_acct=0/1
-
-	  This option will be removed in 2.6.29.
-
-	  If unsure, say `N'.
-
 config NF_CONNTRACK_MARK
 	bool  'Connection mark tracking support'
 	depends on NETFILTER_ADVANCED
@@ -347,6 +326,22 @@ config NETFILTER_XT_CONNMARK
 
 comment "Xtables targets"
 
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the iptables mangle
+	  table.
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum.  This is particularly useful,
+	  if you need to work around old applications such as dhcp clients,
+	  that do not work well with checksum offloads, but don't want to disable
+	  checksum offload in your device.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_CLASSIFY
 	tristate '"CLASSIFY" target support'
 	depends on NETFILTER_ADVANCED
@@ -424,6 +419,18 @@ config NETFILTER_XT_TARGET_HL
 	since you can easily create immortal packets that loop
 	forever on the network.
 
+config NETFILTER_XT_TARGET_IDLETIMER
+	tristate  "IDLETIMER target support"
+	depends on NETFILTER_ADVANCED
+	help
+
+	  This option adds the `IDLETIMER' target.  Each matching packet
+	  resets the timer associated with label specified when the rule is
+	  added.  When the timer expires, it triggers a sysfs notification.
+	  The remaining time for expiration can be read via sysfs.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_LED
 	tristate '"LED" target support'
 	depends on LEDS_CLASS && LEDS_TRIGGERS
@@ -503,7 +510,7 @@ config NETFILTER_XT_TARGET_RATEEST
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config NETFILTER_XT_TARGET_TEE
-	tristate '"TEE" - packet cloning to alternate destiantion'
+	tristate '"TEE" - packet cloning to alternate destination'
 	depends on NETFILTER_ADVANCED
 	depends on (IPV6 || IPV6=n)
 	depends on !NF_CONNTRACK || NF_CONNTRACK
@@ -518,6 +525,7 @@ config NETFILTER_XT_TARGET_TPROXY
 	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
 	help
 	  This option adds a `TPROXY' target, which is somewhat similar to
 	  REDIRECT.  It can only be used in the mangle table and is useful
@@ -618,7 +626,6 @@ config NETFILTER_XT_MATCH_CONNBYTES
 	tristate  '"connbytes" per-connection counter match support'
 	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
-	select NF_CT_ACCT
 	help
 	  This option adds a `connbytes' match, which allows you to match the
 	  number of bytes and/or packets for each direction within a connection.
@@ -657,6 +664,15 @@ config NETFILTER_XT_MATCH_CONNTRACK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_CPU
+	tristate '"cpu" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  CPU matching allows you to match packets based on the CPU
+	  currently handling the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_DCCP
 	tristate '"dccp" protocol match support'
 	depends on NETFILTER_ADVANCED
@@ -736,6 +752,16 @@ config NETFILTER_XT_MATCH_IPRANGE
 
 	If unsure, say M.
 
+config NETFILTER_XT_MATCH_IPVS
+	tristate '"ipvs" match support'
+	depends on IP_VS
+	depends on NETFILTER_ADVANCED
+	depends on NF_CONNTRACK
+	help
+	  This option allows you to match against IPVS properties of a packet.
+
+	  If unsure, say N.
+
 config NETFILTER_XT_MATCH_LENGTH
 	tristate '"length" match support'
 	depends on NETFILTER_ADVANCED
@@ -902,6 +928,7 @@ config NETFILTER_XT_MATCH_SOCKET
 	depends on NETFILTER_ADVANCED
 	depends on !NF_CONNTRACK || NF_CONNTRACK
 	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
 	help
 	  This option adds a `socket' match, which can be used to match
 	  packets for which a TCP or UDP socket lookup finds a valid socket.
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 14e3a8fd818..441050f3111 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
 obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
 
 # targets
+obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
@@ -61,6 +62,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o
 
 # matches
 obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o
@@ -68,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
@@ -75,6 +78,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bf..85dabb86be6 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
 
 static DEFINE_MUTEX(afinfo_mutex);
 
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
 EXPORT_SYMBOL(nf_afinfo);
 
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks);
 
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		nf_unregister_hook(&reg[i]);
+	while (n-- > 0)
+		nf_unregister_hook(&reg[n]);
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 712ccad1334..a22dac22705 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -26,7 +26,7 @@ if IP_VS
 
 config	IP_VS_IPV6
 	bool "IPv6 support for IPVS"
-	depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
+	depends on IPV6 = y || IP_VS = IPV6
 	---help---
 	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
 
@@ -87,19 +87,16 @@ config	IP_VS_PROTO_UDP
 	  protocol. Say Y if unsure.
 
 config	IP_VS_PROTO_AH_ESP
-	bool
-	depends on UNDEFINED
+	def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
 
 config	IP_VS_PROTO_ESP
 	bool "ESP load balancing support"
-	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing ESP (Encapsulation
 	  Security Payload) transport protocol. Say Y if unsure.
 
 config	IP_VS_PROTO_AH
 	bool "AH load balancing support"
-	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing AH (Authentication
 	  Header) transport protocol. Say Y if unsure.
@@ -238,7 +235,8 @@ comment 'IPVS application helper'
 
 config	IP_VS_FTP
   	tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP
+        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+	select IP_VS_NFCT
 	---help---
 	  FTP is a protocol that transfers IP address and/or port number in
 	  the payload. In the virtual server via Network Address Translation,
@@ -250,4 +248,19 @@ config	IP_VS_FTP
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_NFCT
+	bool "Netfilter connection tracking"
+	depends on NF_CONNTRACK
+	---help---
+	  The Netfilter connection tracking support allows the IPVS
+	  connection state to be exported to the Netfilter framework
+	  for filtering purposes.
+
+config	IP_VS_PE_SIP
+	tristate "SIP persistence engine"
+        depends on IP_VS_PROTO_UDP
+	depends on NF_CONNTRACK_SIP
+	---help---
+	  Allow persistence based on the SIP Call-ID
+
 endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd7066..34ee602ddb6 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
 
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
-		ip_vs_est.o ip_vs_proto.o 				   \
-		$(ip_vs_proto-objs-y)
+		ip_vs_est.o ip_vs_proto.o ip_vs_pe.o			   \
+		$(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
 
 
 # IPVS core
@@ -32,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 1cb0e834f8f..a475edee091 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
 		goto out;
 
 	list_add(&inc->a_list, &app->incs_list);
-	IP_VS_DBG(9, "%s application %s:%u registered\n",
-		  pp->name, inc->name, inc->port);
+	IP_VS_DBG(9, "%s App %s:%u registered\n",
+		  pp->name, inc->name, ntohs(inc->port));
 
 	return 0;
 
@@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
 		pp->unregister_app(inc);
 
 	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
-		  pp->name, inc->name, inc->port);
+		  pp->name, inc->name, ntohs(inc->port));
 
 	list_del(&inc->a_list);
 
@@ -569,49 +569,6 @@ static const struct file_operations ip_vs_app_fops = {
 };
 #endif
 
-
-/*
- *	Replace a segment of data with a new segment
- */
-int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
-		      char *o_buf, int o_len, char *n_buf, int n_len)
-{
-	int diff;
-	int o_offset;
-	int o_left;
-
-	EnterFunction(9);
-
-	diff = n_len - o_len;
-	o_offset = o_buf - (char *)skb->data;
-	/* The length of left data after o_buf+o_len in the skb data */
-	o_left = skb->len - (o_offset + o_len);
-
-	if (diff <= 0) {
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-		skb_trim(skb, skb->len + diff);
-	} else if (diff <= skb_tailroom(skb)) {
-		skb_put(skb, diff);
-		memmove(o_buf + n_len, o_buf + o_len, o_left);
-		memcpy(o_buf, n_buf, n_len);
-	} else {
-		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
-			return -ENOMEM;
-		skb_put(skb, diff);
-		memmove(skb->data + o_offset + n_len,
-			skb->data + o_offset + o_len, o_left);
-		skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
-	}
-
-	/* must update the iph total length here */
-	ip_hdr(skb)->tot_len = htons(skb->len);
-
-	LeaveFunction(9);
-	return 0;
-}
-
-
 int __init ip_vs_app_init(void)
 {
 	/* we will replace it with proc_net_ipvs_create() soon */
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index ff04e9edbed..e9adecdc8ca 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
 		& ip_vs_conn_tab_mask;
 }
 
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+					     bool inverse)
+{
+	const union nf_inet_addr *addr;
+	__be16 port;
+
+	if (p->pe_data && p->pe->hashkey_raw)
+		return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+			ip_vs_conn_tab_mask;
+
+	if (likely(!inverse)) {
+		addr = p->caddr;
+		port = p->cport;
+	} else {
+		addr = p->vaddr;
+		port = p->vport;
+	}
+
+	return ip_vs_conn_hashkey(p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn_param p;
+
+	ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
+			      NULL, 0, &p);
+
+	if (cp->dest && cp->dest->svc->pe) {
+		p.pe = cp->dest->svc->pe;
+		p.pe_data = cp->pe_data;
+		p.pe_data_len = cp->pe_data_len;
+	}
+
+	return ip_vs_conn_hashkey_param(&p, false);
+}
 
 /*
  *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
@@ -158,8 +194,11 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
 	unsigned hash;
 	int ret;
 
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return 0;
+
 	/* Hash by protocol, client address and port */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -192,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 	int ret;
 
 	/* unhash it and decrease its reference counter */
-	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
+	hash = ip_vs_conn_hashkey_conn(cp);
 
 	ct_write_lock(hash);
 	spin_lock(&cp->lock);
@@ -215,27 +254,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
 /*
  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
  *  Called for pkts coming from OUTside-to-INside.
- *	s_addr, s_port: pkt source address (foreign host)
- *	d_addr, d_port: pkt dest address (load balancer)
+ *	p->caddr, p->cport: pkt source address (foreign host)
+ *	p->vaddr, p->vport: pkt dest address (load balancer)
  */
-static inline struct ip_vs_conn *__ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
-		    ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
+		    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ct_read_unlock(hash);
@@ -248,76 +286,111 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
 	return NULL;
 }
 
-struct ip_vs_conn *ip_vs_conn_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
 {
 	struct ip_vs_conn *cp;
 
-	cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
-	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
-		cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
-					 d_port);
+	cp = __ip_vs_conn_in_get(p);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+		struct ip_vs_conn_param cport_zero_p = *p;
+		cport_zero_p.cport = 0;
+		cp = __ip_vs_conn_in_get(&cport_zero_p);
+	}
 
 	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+			    const struct ip_vs_iphdr *iph,
+			    unsigned int proto_off, int inverse,
+			    struct ip_vs_conn_param *p)
+{
+	__be16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return 1;
+
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0],
+				      &iph->daddr, pptr[1], p);
+	else
+		ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1],
+				      &iph->saddr, pptr[0], p);
+	return 0;
+}
+
+struct ip_vs_conn *
+ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
+			struct ip_vs_protocol *pp,
+			const struct ip_vs_iphdr *iph,
+			unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn_param p;
+
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+		return NULL;
+
+	return ip_vs_conn_in_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
+
 /* Get reference to connection template */
-struct ip_vs_conn *ip_vs_ct_in_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp;
 
-	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
+	hash = ip_vs_conn_hashkey_param(p, false);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
+		if (p->pe_data && p->pe->ct_match) {
+			if (p->pe->ct_match(p, cp))
+				goto out;
+			continue;
+		}
+
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
 		    /* protocol should only be IPPROTO_IP if
-		     * d_addr is a fwmark */
-		    ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af,
-		                     d_addr, &cp->vaddr) &&
-		    s_port == cp->cport && d_port == cp->vport &&
+		     * p->vaddr is a fwmark */
+		    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+				     p->af, p->vaddr, &cp->vaddr) &&
+		    p->cport == cp->cport && p->vport == cp->vport &&
 		    cp->flags & IP_VS_CONN_F_TEMPLATE &&
-		    protocol == cp->protocol) {
-			/* HIT */
-			atomic_inc(&cp->refcnt);
+		    p->protocol == cp->protocol)
 			goto out;
-		}
 	}
 	cp = NULL;
 
   out:
+	if (cp)
+		atomic_inc(&cp->refcnt);
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      cp ? "hit" : "not hit");
 
 	return cp;
 }
 
-/*
- *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
- *  Called for pkts coming from inside-to-OUTside.
- *	s_addr, s_port: pkt source address (inside host)
- *	d_addr, d_port: pkt dest address (foreign host)
- */
-struct ip_vs_conn *ip_vs_conn_out_get
-(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
- const union nf_inet_addr *d_addr, __be16 d_port)
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ *	p->caddr, p->cport: pkt source address (inside host)
+ *	p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
 {
 	unsigned hash;
 	struct ip_vs_conn *cp, *ret=NULL;
@@ -325,16 +398,16 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	/*
 	 *	Check for "full" addressed entries
 	 */
-	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
+	hash = ip_vs_conn_hashkey_param(p, true);
 
 	ct_read_lock(hash);
 
 	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
-		if (cp->af == af &&
-		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
-		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
-		    d_port == cp->cport && s_port == cp->dport &&
-		    protocol == cp->protocol) {
+		if (cp->af == p->af &&
+		    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+		    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+		    p->vport == cp->cport && p->cport == cp->dport &&
+		    p->protocol == cp->protocol) {
 			/* HIT */
 			atomic_inc(&cp->refcnt);
 			ret = cp;
@@ -345,22 +418,37 @@ struct ip_vs_conn *ip_vs_conn_out_get
 	ct_read_unlock(hash);
 
 	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
-		      ip_vs_proto_name(protocol),
-		      IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
-		      IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
 		      ret ? "hit" : "not hit");
 
 	return ret;
 }
 
+struct ip_vs_conn *
+ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
+			 struct ip_vs_protocol *pp,
+			 const struct ip_vs_iphdr *iph,
+			 unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn_param p;
+
+	if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
+		return NULL;
+
+	return ip_vs_conn_out_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
 
 /*
  *      Put back the conn and restart its timer with its timeout
  */
 void ip_vs_conn_put(struct ip_vs_conn *cp)
 {
-	/* reset it expire in its timeout */
-	mod_timer(&cp->timer, jiffies+cp->timeout);
+	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
+		0 : cp->timeout;
+	mod_timer(&cp->timer, jiffies+t);
 
 	__ip_vs_conn_put(cp);
 }
@@ -456,6 +544,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
 static inline void
 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 {
+	unsigned int conn_flags;
+
 	/* if dest is NULL, then return directly */
 	if (!dest)
 		return;
@@ -463,16 +553,20 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
 	/* Increase the refcnt counter of the dest */
 	atomic_inc(&dest->refcnt);
 
+	conn_flags = atomic_read(&dest->conn_flags);
+	if (cp->protocol != IPPROTO_UDP)
+		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
 	/* Bind with the destination and its corresponding transmitter */
-	if ((cp->flags & IP_VS_CONN_F_SYNC) &&
-	    (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
+	if (cp->flags & IP_VS_CONN_F_SYNC) {
 		/* if the connection is not template and is created
 		 * by sync, preserve the activity flag.
 		 */
-		cp->flags |= atomic_read(&dest->conn_flags) &
-			     (~IP_VS_CONN_F_INACTIVE);
-	else
-		cp->flags |= atomic_read(&dest->conn_flags);
+		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE))
+			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+		/* connections inherit forwarding method from dest */
+		cp->flags &= ~IP_VS_CONN_F_FWD_MASK;
+	}
+	cp->flags |= conn_flags;
 	cp->dest = dest;
 
 	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
@@ -653,7 +747,7 @@ static void ip_vs_conn_expire(unsigned long data)
 	/*
 	 *	unhash it if it is hashed in the conn table
 	 */
-	if (!ip_vs_conn_unhash(cp))
+	if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
 		goto expire_later;
 
 	/*
@@ -668,6 +762,10 @@ static void ip_vs_conn_expire(unsigned long data)
 		if (cp->control)
 			ip_vs_control_del(cp);
 
+		if (cp->flags & IP_VS_CONN_F_NFCT)
+			ip_vs_conn_drop_conntrack(cp);
+
+		kfree(cp->pe_data);
 		if (unlikely(cp->app != NULL))
 			ip_vs_unbind_app(cp);
 		ip_vs_unbind_dest(cp);
@@ -702,13 +800,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
  *	Create a new connection entry and hash it into the ip_vs_conn_tab
  */
 struct ip_vs_conn *
-ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
-	       const union nf_inet_addr *vaddr, __be16 vport,
+ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
 	       struct ip_vs_dest *dest)
 {
 	struct ip_vs_conn *cp;
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
 
 	cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
@@ -718,17 +815,21 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
 	INIT_LIST_HEAD(&cp->c_list);
 	setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
-	cp->af		   = af;
-	cp->protocol	   = proto;
-	ip_vs_addr_copy(af, &cp->caddr, caddr);
-	cp->cport	   = cport;
-	ip_vs_addr_copy(af, &cp->vaddr, vaddr);
-	cp->vport	   = vport;
+	cp->af		   = p->af;
+	cp->protocol	   = p->protocol;
+	ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+	cp->cport	   = p->cport;
+	ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+	cp->vport	   = p->vport;
 	/* proto should only be IPPROTO_IP if d_addr is a fwmark */
-	ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af,
+	ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
+	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+		cp->pe_data = p->pe_data;
+		cp->pe_data_len = p->pe_data_len;
+	}
 	spin_lock_init(&cp->lock);
 
 	/*
@@ -754,7 +855,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 
 	/* Bind its packet transmitter */
 #ifdef CONFIG_IP_VS_IPV6
-	if (af == AF_INET6)
+	if (p->af == AF_INET6)
 		ip_vs_bind_xmit_v6(cp);
 	else
 #endif
@@ -763,13 +864,22 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
 	if (unlikely(pp && atomic_read(&pp->appcnt)))
 		ip_vs_bind_app(cp, pp);
 
+	/*
+	 * Allow conntrack to be preserved. By default, conntrack
+	 * is created and destroyed for every packet.
+	 * Sometimes keeping conntrack can be useful for
+	 * IP_VS_CONN_F_ONE_PACKET too.
+	 */
+
+	if (ip_vs_conntrack_enabled())
+		cp->flags |= IP_VS_CONN_F_NFCT;
+
 	/* Hash it in the ip_vs_conn_tab finally */
 	ip_vs_conn_hash(cp);
 
 	return cp;
 }
 
-
 /*
  *	/proc/net/ip_vs_conn entries
  */
@@ -785,7 +895,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
 		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (pos-- == 0) {
 				seq->private = &ip_vs_conn_tab[idx];
-				return cp;
+			return cp;
 			}
 		}
 		ct_read_unlock_bh(idx);
@@ -842,30 +952,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 
 	if (v == SEQ_START_TOKEN)
 		seq_puts(seq,
-   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
 	else {
 		const struct ip_vs_conn *cp = v;
+		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+		size_t len = 0;
+
+		if (cp->dest && cp->pe_data &&
+		    cp->dest->svc->pe->show_pe_data) {
+			pe_data[0] = ' ';
+			len = strlen(cp->dest->svc->pe->name);
+			memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+			pe_data[len + 1] = ' ';
+			len += 2;
+			len += cp->dest->svc->pe->show_pe_data(cp,
+							       pe_data + len);
+		}
+		pe_data[len] = '\0';
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
-			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+				"%pI6 %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
 				&cp->daddr.in6, ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 		else
 #endif
 			seq_printf(seq,
 				"%-3s %08X %04X %08X %04X"
-				" %08X %04X %-11s %7lu\n",
+				" %08X %04X %-11s %7lu%s\n",
 				ip_vs_proto_name(cp->protocol),
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
 				ntohl(cp->daddr.ip), ntohs(cp->dport),
 				ip_vs_state_name(cp->protocol, cp->state),
-				(cp->timer.expires-jiffies)/HZ);
+				(cp->timer.expires-jiffies)/HZ, pe_data);
 	}
 	return 0;
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1cd6e3fd058..b4e51e9c5a0 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -40,6 +40,7 @@
 #include <net/udp.h>
 #include <net/icmp.h>                   /* for icmp_send */
 #include <net/route.h>
+#include <net/ip6_checksum.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -47,6 +48,7 @@
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
 #endif
 
 #include <net/ip_vs.h>
@@ -54,7 +56,6 @@
 
 EXPORT_SYMBOL(register_ip_vs_scheduler);
 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
-EXPORT_SYMBOL(ip_vs_skb_replace);
 EXPORT_SYMBOL(ip_vs_proto_name);
 EXPORT_SYMBOL(ip_vs_conn_new);
 EXPORT_SYMBOL(ip_vs_conn_in_get);
@@ -176,6 +177,18 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
+static inline void
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
+	p->pe = svc->pe;
+	if (p->pe && p->pe->fill_param)
+		p->pe->fill_param(p, skb);
+}
 
 /*
  *  IPVS persistent scheduling function
@@ -186,14 +199,16 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
  */
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
-		    const struct sk_buff *skb,
+		    struct sk_buff *skb,
 		    __be16 ports[2])
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	struct ip_vs_conn *ct;
-	__be16  dport;			/* destination port to forward */
+	__be16 dport = 0;		/* destination port to forward */
+	unsigned int flags;
+	struct ip_vs_conn_param param;
 	union nf_inet_addr snet;	/* source network of the client,
 					   after masking */
 
@@ -226,129 +241,85 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 	 * is created for other persistent services.
 	 */
-	if (ports[1] == svc->port) {
-		/* Check if a template already exists */
-		if (svc->port != FTPPORT)
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, ports[1]);
-		else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * No template found or the dest of the connection
-			 * template is not available.
-			 */
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
-			}
-
-			/*
-			 * Create a template like <protocol,caddr,0,
-			 * vaddr,vport,daddr,dport> for non-ftp service,
-			 * and <protocol,caddr,0,vaddr,0,daddr,0>
-			 * for ftp service.
+	{
+		int protocol = iph.protocol;
+		const union nf_inet_addr *vaddr = &iph.daddr;
+		const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+		__be16 vport = 0;
+
+		if (ports[1] == svc->port) {
+			/* non-FTP template:
+			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+			 * FTP template:
+			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 			 */
 			if (svc->port != FTPPORT)
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr,
-						    ports[1],
-						    &dest->addr, dest->port,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
+				vport = ports[1];
 		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
-		}
-		dport = dest->port;
-	} else {
-		/*
-		 * Note: persistent fwmark-based services and persistent
-		 * port zero service are handled here.
-		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
-		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
-		 */
-		if (svc->fwmark) {
-			union nf_inet_addr fwmark = {
-				.ip = htonl(svc->fwmark)
-			};
-
-			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
-					     &fwmark, 0);
-		} else
-			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
-					     &iph.daddr, 0);
-
-		if (!ct || !ip_vs_check_template(ct)) {
-			/*
-			 * If it is not persistent port zero, return NULL,
-			 * otherwise create a connection template.
+			/* Note: persistent fwmark-based services and
+			 * persistent port zero service are handled here.
+			 * fwmark template:
+			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+			 * port zero template:
+			 * <protocol,caddr,0,vaddr,0,daddr,0>
 			 */
-			if (svc->port)
-				return NULL;
-
-			dest = svc->scheduler->schedule(svc, skb);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "p-schedule: no dest found.\n");
-				return NULL;
+			if (svc->fwmark) {
+				protocol = IPPROTO_IP;
+				vaddr = &fwmark;
 			}
+		}
+		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+					      vaddr, vport, &param);
+	}
 
-			/*
-			 * Create a template according to the service
-			 */
-			if (svc->fwmark) {
-				union nf_inet_addr fwmark = {
-					.ip = htonl(svc->fwmark)
-				};
-
-				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
-						    &snet, 0,
-						    &fwmark, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			} else
-				ct = ip_vs_conn_new(svc->af, iph.protocol,
-						    &snet, 0,
-						    &iph.daddr, 0,
-						    &dest->addr, 0,
-						    IP_VS_CONN_F_TEMPLATE,
-						    dest);
-			if (ct == NULL)
-				return NULL;
-
-			ct->timeout = svc->timeout;
-		} else {
-			/* set destination with the found template */
-			dest = ct->dest;
+	/* Check if a template already exists */
+	ct = ip_vs_ct_in_get(&param);
+	if (!ct || !ip_vs_check_template(ct)) {
+		/* No template found or the dest of the connection
+		 * template is not available.
+		 */
+		dest = svc->scheduler->schedule(svc, skb);
+		if (!dest) {
+			IP_VS_DBG(1, "p-schedule: no dest found.\n");
+			kfree(param.pe_data);
+			return NULL;
 		}
-		dport = ports[1];
+
+		if (ports[1] == svc->port && svc->port != FTPPORT)
+			dport = dest->port;
+
+		/* Create a template
+		 * This adds param.pe_data to the template,
+		 * and thus param.pe_data will be destroyed
+		 * when the template expires */
+		ct = ip_vs_conn_new(&param, &dest->addr, dport,
+				    IP_VS_CONN_F_TEMPLATE, dest);
+		if (ct == NULL) {
+			kfree(param.pe_data);
+			return NULL;
+		}
+
+		ct->timeout = svc->timeout;
+	} else {
+		/* set destination with the found template */
+		dest = ct->dest;
+		kfree(param.pe_data);
 	}
 
+	dport = ports[1];
+	if (dport == svc->port && dest->port)
+		dport = dest->port;
+
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph.protocol == IPPROTO_UDP)?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
 	/*
 	 *    Create a new connection according to the template
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, ports[0],
-			    &iph.daddr, ports[1],
-			    &dest->addr, dport,
-			    0,
-			    dest);
+	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
+			      &iph.daddr, ports[1], &param);
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
 		return NULL;
@@ -372,23 +343,53 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  Protocols supported: TCP, UDP
  */
 struct ip_vs_conn *
-ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+	       struct ip_vs_protocol *pp, int *ignored)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
 	struct ip_vs_dest *dest;
 	__be16 _ports[2], *pptr;
+	unsigned int flags;
 
+	*ignored = 1;
 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 	if (pptr == NULL)
 		return NULL;
 
 	/*
+	 * FTPDATA needs this check when using local real server.
+	 * Never schedule Active FTPDATA connections from real server.
+	 * For LVS-NAT they must be already created. For other methods
+	 * with persistence the connection is created on SYN+ACK.
+	 */
+	if (pptr[0] == FTPDATA) {
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling FTPDATA");
+		return NULL;
+	}
+
+	/*
+	 * Do not schedule replies from local real server. It is risky
+	 * for fwmark services but mostly for persistent services.
+	 */
+	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
+	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
+		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+			      "Not scheduling reply for existing connection");
+		__ip_vs_conn_put(cp);
+		return NULL;
+	}
+
+	/*
 	 *    Persistent service
 	 */
-	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
+		*ignored = 0;
 		return ip_vs_sched_persist(svc, skb, pptr);
+	}
 
 	/*
 	 *    Non-persistent service
@@ -401,23 +402,31 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 		return NULL;
 	}
 
+	*ignored = 0;
+
 	dest = svc->scheduler->schedule(svc, skb);
 	if (dest == NULL) {
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
 		return NULL;
 	}
 
+	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+		 && iph.protocol == IPPROTO_UDP)?
+		IP_VS_CONN_F_ONE_PACKET : 0;
+
 	/*
 	 *    Create a connection entry.
 	 */
-	cp = ip_vs_conn_new(svc->af, iph.protocol,
-			    &iph.saddr, pptr[0],
-			    &iph.daddr, pptr[1],
-			    &dest->addr, dest->port ? dest->port : pptr[1],
-			    0,
-			    dest);
-	if (cp == NULL)
-		return NULL;
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
+				      pptr[0], &iph.daddr, pptr[1], &p);
+		cp = ip_vs_conn_new(&p, &dest->addr,
+				    dest->port ? dest->port : pptr[1],
+				    flags, dest);
+		if (!cp)
+			return NULL;
+	}
 
 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
@@ -464,20 +473,26 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 		int ret, cs;
 		struct ip_vs_conn *cp;
+		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				      iph.protocol == IPPROTO_UDP)?
+				      IP_VS_CONN_F_ONE_PACKET : 0;
 		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 
 		ip_vs_service_put(svc);
 
 		/* create a new connection entry */
 		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
-		cp = ip_vs_conn_new(svc->af, iph.protocol,
-				    &iph.saddr, pptr[0],
-				    &iph.daddr, pptr[1],
-				    &daddr, 0,
-				    IP_VS_CONN_F_BYPASS,
-				    NULL);
-		if (cp == NULL)
-			return NF_DROP;
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc->af, iph.protocol,
+					      &iph.saddr, pptr[0],
+					      &iph.daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL);
+			if (!cp)
+				return NF_DROP;
+		}
 
 		/* statistics */
 		ip_vs_in_stats(cp, skb);
@@ -515,38 +530,32 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 	 */
 #ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6)
+	if (svc->af == AF_INET6) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
-	else
+	} else
 #endif
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
 	return NF_DROP;
 }
 
-
-/*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- *      chain, and is used for VS/NAT.
- *      It detects packets for VS/NAT connections and sends the packets
- *      immediately. This can avoid that iptable_nat mangles the packets
- *      for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
-				       struct sk_buff *skb,
-				       const struct net_device *in,
-				       const struct net_device *out,
-				       int (*okfn)(struct sk_buff *))
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 {
-	if (!skb->ipvs_property)
-		return NF_ACCEPT;
-	/* The packet was sent from IPVS, exit this chain */
-	return NF_STOP;
+	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 }
 
-__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 {
-	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+	if (NF_INET_LOCAL_IN == hooknum)
+		return IP_DEFRAG_VS_IN;
+	if (NF_INET_FORWARD == hooknum)
+		return IP_DEFRAG_VS_FWD;
+	return IP_DEFRAG_VS_OUT;
 }
 
 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -609,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 			"Forwarding altered outgoing ICMP");
 	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 			"Forwarding altered incoming ICMP");
 }
 
@@ -646,17 +655,21 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 	}
 
 	/* And finally the ICMP checksum */
-	icmph->icmp6_cksum = 0;
-	/* TODO IPv6: is this correct for ICMPv6? */
-	ip_vs_checksum_complete(skb, icmp_offset);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+					      skb->len - icmp_offset,
+					      IPPROTO_ICMPV6, 0);
+	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+	skb->ip_summed = CHECKSUM_PARTIAL;
 
 	if (inout)
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered outgoing ICMPv6");
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered outgoing ICMPv6");
 	else
-		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
-			"Forwarding altered incoming ICMPv6");
+		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+			      (void *)ciph - (void *)iph,
+			      "Forwarding altered incoming ICMPv6");
 }
 #endif
 
@@ -697,10 +710,25 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
 #endif
 		ip_vs_nat_icmp(skb, pp, cp, 1);
 
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6) {
+		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
+			goto out;
+	} else
+#endif
+		if ((sysctl_ip_vs_snat_reroute ||
+		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
+			goto out;
+
 	/* do the statistics and put it back */
 	ip_vs_out_stats(cp, skb);
 
 	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
 	verdict = NF_ACCEPT;
 
 out:
@@ -714,7 +742,8 @@ out:
  *	Find any that might be relevant, check against existing connections.
  *	Currently handles error types - unreachable, quench, ttl exceeded.
  */
-static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+			  unsigned int hooknum)
 {
 	struct iphdr *iph;
 	struct icmphdr	_icmph, *ic;
@@ -729,7 +758,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 
 	/* reassemble IP fragments */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -772,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 		     pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking outgoing ICMP for");
 
 	offset += cih->ihl * 4;
 
@@ -788,7 +818,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+			     unsigned int hooknum)
 {
 	struct ipv6hdr *iph;
 	struct icmp6hdr	_icmph, *ic;
@@ -804,7 +835,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 
 	/* reassemble IP fragments */
 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -847,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking outgoing ICMPv6 for");
 
 	offset += sizeof(struct ipv6hdr);
 
@@ -895,7 +927,7 @@ static unsigned int
 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		struct ip_vs_conn *cp, int ihl)
 {
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
 
 	if (!skb_make_writable(skb, ihl))
 		goto drop;
@@ -914,6 +946,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		ip_send_check(ip_hdr(skb));
 	}
 
+	/*
+	 * nf_iterate does not expect change in the skb->dst->dev.
+	 * It looks like it is not fatal to enable this code for hooks
+	 * where our handlers are at the end of the chain list and
+	 * when all next handlers use skb->dst->dev and not outdev.
+	 * It will definitely route properly the inout NAT traffic
+	 * when multiple paths are used.
+	 */
+
 	/* For policy routing, packets originating from this
 	 * machine itself may be routed differently to packets
 	 * passing through.  We want this packet to be routed as
@@ -922,20 +963,25 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	 */
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (ip6_route_me_harder(skb) != 0)
+		if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
 			goto drop;
 	} else
 #endif
-		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
+		if ((sysctl_ip_vs_snat_reroute ||
+		     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+		    ip_route_me_harder(skb, RTN_LOCAL) != 0)
 			goto drop;
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
 
 	ip_vs_out_stats(cp, skb);
 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-	ip_vs_conn_put(cp);
-
 	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
+	ip_vs_conn_put(cp);
 
 	LeaveFunction(11);
 	return NF_ACCEPT;
@@ -943,35 +989,46 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 drop:
 	ip_vs_conn_put(cp);
 	kfree_skb(skb);
+	LeaveFunction(11);
 	return NF_STOLEN;
 }
 
 /*
- *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
  *	Check if outgoing packet belongs to the established ip_vs_conn.
  */
 static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
-	  const struct net_device *in, const struct net_device *out,
-	  int (*okfn)(struct sk_buff *))
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
 {
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int af;
 
 	EnterFunction(11);
 
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
+	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
 
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
+
+	if (unlikely(!skb_dst(skb)))
+		return NF_ACCEPT;
+
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
+			int related;
+			int verdict = ip_vs_out_icmp_v6(skb, &related,
+							hooknum);
 
 			if (related)
 				return verdict;
@@ -980,7 +1037,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	} else
 #endif
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_out_icmp(skb, &related);
+			int related;
+			int verdict = ip_vs_out_icmp(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -994,19 +1052,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	/* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
-		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
-
-			if (related)
-				return verdict;
-
-			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
+			if (ip_vs_gather_frags_v6(skb,
+						  ip_vs_defrag_user(hooknum)))
+				return NF_STOLEN;
 		}
+
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 	} else
 #endif
 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
 			     !pp->dont_defrag)) {
-			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
+			if (ip_vs_gather_frags(skb,
+					       ip_vs_defrag_user(hooknum)))
 				return NF_STOLEN;
 
 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
@@ -1017,55 +1075,123 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 	 */
 	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
-	if (unlikely(!cp)) {
-		if (sysctl_ip_vs_nat_icmp_send &&
-		    (pp->protocol == IPPROTO_TCP ||
-		     pp->protocol == IPPROTO_UDP ||
-		     pp->protocol == IPPROTO_SCTP)) {
-			__be16 _ports[2], *pptr;
-
-			pptr = skb_header_pointer(skb, iph.len,
-						  sizeof(_ports), _ports);
-			if (pptr == NULL)
-				return NF_ACCEPT;	/* Not for me */
-			if (ip_vs_lookup_real_service(af, iph.protocol,
-						      &iph.saddr,
-						      pptr[0])) {
-				/*
-				 * Notify the real server: there is no
-				 * existing entry if it is not RST
-				 * packet or not TCP packet.
-				 */
-				if ((iph.protocol != IPPROTO_TCP &&
-				     iph.protocol != IPPROTO_SCTP)
-				     || ((iph.protocol == IPPROTO_TCP
-					  && !is_tcp_reset(skb, iph.len))
-					 || (iph.protocol == IPPROTO_SCTP
-						&& !is_sctp_abort(skb,
-							iph.len)))) {
+	if (likely(cp))
+		return handle_response(af, skb, pp, cp, iph.len);
+	if (sysctl_ip_vs_nat_icmp_send &&
+	    (pp->protocol == IPPROTO_TCP ||
+	     pp->protocol == IPPROTO_UDP ||
+	     pp->protocol == IPPROTO_SCTP)) {
+		__be16 _ports[2], *pptr;
+
+		pptr = skb_header_pointer(skb, iph.len,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			return NF_ACCEPT;	/* Not for me */
+		if (ip_vs_lookup_real_service(af, iph.protocol,
+					      &iph.saddr,
+					      pptr[0])) {
+			/*
+			 * Notify the real server: there is no
+			 * existing entry if it is not RST
+			 * packet or not TCP packet.
+			 */
+			if ((iph.protocol != IPPROTO_TCP &&
+			     iph.protocol != IPPROTO_SCTP)
+			     || ((iph.protocol == IPPROTO_TCP
+				  && !is_tcp_reset(skb, iph.len))
+				 || (iph.protocol == IPPROTO_SCTP
+					&& !is_sctp_abort(skb,
+						iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
-					if (af == AF_INET6)
-						icmpv6_send(skb,
-							    ICMPV6_DEST_UNREACH,
-							    ICMPV6_PORT_UNREACH,
-							    0);
-					else
+				if (af == AF_INET6) {
+					struct net *net =
+						dev_net(skb_dst(skb)->dev);
+
+					if (!skb->dev)
+						skb->dev = net->loopback_dev;
+					icmpv6_send(skb,
+						    ICMPV6_DEST_UNREACH,
+						    ICMPV6_PORT_UNREACH,
+						    0);
+				} else
 #endif
-						icmp_send(skb,
-							  ICMP_DEST_UNREACH,
-							  ICMP_PORT_UNREACH, 0);
-					return NF_DROP;
-				}
+					icmp_send(skb,
+						  ICMP_DEST_UNREACH,
+						  ICMP_PORT_UNREACH, 0);
+				return NF_DROP;
 			}
 		}
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
-		return NF_ACCEPT;
 	}
+	IP_VS_DBG_PKT(12, af, pp, skb, 0,
+		      "ip_vs_out: packet continues traversal as normal");
+	return NF_ACCEPT;
+}
 
-	return handle_response(af, skb, pp, cp, iph.len);
+/*
+ *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *	used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET);
+}
+
+/*
+ *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_out(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *	used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_out(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ *	Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_out(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return verdict;
 }
 
+#endif
 
 /*
  *	Handle ICMP messages in the outside-to-inside direction (incoming).
@@ -1089,8 +1215,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 
 	/* reassemble IP fragments */
 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-		if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
-					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
+		if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -1133,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 		     pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+		      "Checking incoming ICMP for");
 
 	offset += cih->ihl * 4;
 
@@ -1167,7 +1293,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
-	/* do not touch skb anymore */
+	/* LOCALNODE from FORWARD hook is not supported */
+	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+	    skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
+		IP_VS_DBG(1, "%s(): "
+			  "local delivery to %pI4 but in FORWARD\n",
+			  __func__, &skb_rtable(skb)->rt_dst);
+		verdict = NF_DROP;
+	}
 
   out:
 	__ip_vs_conn_put(cp);
@@ -1188,14 +1321,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	struct ip_vs_protocol *pp;
 	unsigned int offset, verdict;
 	union nf_inet_addr snet;
+	struct rt6_info *rt;
 
 	*related = 1;
 
 	/* reassemble IP fragments */
 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
-					       IP_DEFRAG_VS_IN :
-					       IP_DEFRAG_VS_FWD))
+		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
 			return NF_STOLEN;
 	}
 
@@ -1238,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 		return NF_ACCEPT;
 
-	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking incoming ICMPv6 for");
 
 	offset += sizeof(struct ipv6hdr);
 
@@ -1266,7 +1399,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	    IPPROTO_SCTP == cih->nexthdr)
 		offset += 2 * sizeof(__u16);
 	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
-	/* do not touch skb anymore */
+	/* LOCALNODE from FORWARD hook is not supported */
+	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
+	    (rt = (struct rt6_info *) skb_dst(skb)) &&
+	    rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "local delivery to %pI6 but in FORWARD\n",
+			  __func__, &rt->rt6i_dst);
+		verdict = NF_DROP;
+	}
 
 	__ip_vs_conn_put(cp);
 
@@ -1280,35 +1421,49 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
  *	and send it on its way...
  */
 static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-	 const struct net_device *in, const struct net_device *out,
-	 int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
-	int ret, restart, af, pkts;
+	int ret, restart, pkts;
 
-	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+	/* Already marked as IPVS request or reply? */
+	if (skb->ipvs_property)
+		return NF_ACCEPT;
 
 	/*
-	 *	Big tappo: only PACKET_HOST, including loopback for local client
-	 *	Don't handle local packets on IPv6 for now
+	 *	Big tappo:
+	 *	- remote client: only PACKET_HOST
+	 *	- route: used for struct net when skb->dev is unset
 	 */
-	if (unlikely(skb->pkt_type != PACKET_HOST)) {
-		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-			      skb->pkt_type,
-			      iph.protocol,
-			      IP_VS_DBG_ADDR(af, &iph.daddr));
+	if (unlikely((skb->pkt_type != PACKET_HOST &&
+		      hooknum != NF_INET_LOCAL_OUT) ||
+		     !skb_dst(skb))) {
+		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+			      " ignored in hook %u\n",
+			      skb->pkt_type, iph.protocol,
+			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
 		return NF_ACCEPT;
 	}
+	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+	/* Bad... Do not break raw sockets */
+	if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+		     af == AF_INET)) {
+		struct sock *sk = skb->sk;
+		struct inet_sock *inet = inet_sk(skb->sk);
+
+		if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+			return NF_ACCEPT;
+	}
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6) {
 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
-			int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+			int related;
+			int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -1317,7 +1472,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	} else
 #endif
 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
-			int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
+			int related;
+			int verdict = ip_vs_in_icmp(skb, &related, hooknum);
 
 			if (related)
 				return verdict;
@@ -1337,23 +1493,18 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(!cp)) {
 		int v;
 
-		/* For local client packets, it could be a response */
-		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-		if (cp)
-			return handle_response(af, skb, pp, cp, iph.len);
-
 		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
 			return v;
 	}
 
 	if (unlikely(!cp)) {
 		/* sorry, all this trouble for a no-hit :) */
-		IP_VS_DBG_PKT(12, pp, skb, 0,
-			      "packet continues traversal as normal");
+		IP_VS_DBG_PKT(12, af, pp, skb, 0,
+			      "ip_vs_in: packet continues traversal as normal");
 		return NF_ACCEPT;
 	}
 
-	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
 
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1389,8 +1540,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 	if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
-			(atomic_read(&cp->in_pkts) %
-			 sysctl_ip_vs_sync_threshold[1]
+			(pkts % sysctl_ip_vs_sync_threshold[1]
 			 == sysctl_ip_vs_sync_threshold[0])) ||
 				(cp->old_state != cp->state &&
 				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
@@ -1401,7 +1551,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
 		}
 	}
 
-	if (af == AF_INET &&
+	/* Keep this block last: TCP and others with pp->num_states <= 1 */
+	else if (af == AF_INET &&
 	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
@@ -1420,6 +1571,72 @@ out:
 	return ret;
 }
 
+/*
+ *	AF_INET handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *	AF_INET handler in NF_INET_LOCAL_OUT chain
+ *	Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_in(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ *	Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_in(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return verdict;
+}
+
+#endif
+
 
 /*
  *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1460,23 +1677,39 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 99,
+	},
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
 	 * applied to IPVS. */
 	{
-		.hook		= ip_vs_in,
+		.hook		= ip_vs_remote_request4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 101,
 	},
-	/* After packet filtering, change source only for VS/NAT */
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -99,
+	},
+	/* After mangle, schedule and forward local requests */
 	{
-		.hook		= ip_vs_out,
+		.hook		= ip_vs_local_request4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -98,
 	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1484,35 +1717,51 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hook		= ip_vs_forward_icmp,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	/* After packet filtering, change source only for VS/NAT */
 	{
-		.hook		= ip_vs_post_routing,
+		.hook		= ip_vs_reply4,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP_PRI_NAT_SRC-1,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
 	},
 #ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 99,
+	},
 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
 	 * or VS/NAT(change destination), so that filtering rules can be
 	 * applied to IPVS. */
 	{
-		.hook		= ip_vs_in,
+		.hook		= ip_vs_remote_request6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_LOCAL_IN,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= 101,
 	},
-	/* After packet filtering, change source only for VS/NAT */
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -99,
+	},
+	/* After mangle, schedule and forward local requests */
 	{
-		.hook		= ip_vs_out,
+		.hook		= ip_vs_local_request6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 100,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= -98,
 	},
 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
@@ -1520,16 +1769,16 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
 		.hook		= ip_vs_forward_icmp_v6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_FORWARD,
-		.priority       = 99,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
 	},
-	/* Before the netfilter connection tracking, exit from POST_ROUTING */
+	/* After packet filtering, change source only for VS/NAT */
 	{
-		.hook		= ip_vs_post_routing,
+		.hook		= ip_vs_reply6,
 		.owner		= THIS_MODULE,
 		.pf		= PF_INET6,
-		.hooknum        = NF_INET_POST_ROUTING,
-		.priority       = NF_IP6_PRI_NAT_SRC-1,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
 	},
 #endif
 };
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 36dc1d88c2f..5f5daa30b0a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock);
 static DEFINE_RWLOCK(__ip_vs_rs_lock);
 
 /* lock for state and timeout tables */
-static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
 
 /* lock for drop entry handling */
 static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
@@ -88,6 +88,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
 int sysctl_ip_vs_expire_quiescent_template = 0;
 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
 int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
+int sysctl_ip_vs_snat_reroute = 1;
 
 
 #ifdef CONFIG_IP_VS_DEBUG
@@ -204,7 +208,7 @@ static void update_defense_level(void)
 	spin_unlock(&__ip_vs_droppacket_lock);
 
 	/* secure_tcp */
-	write_lock(&__ip_vs_securetcp_lock);
+	spin_lock(&ip_vs_securetcp_lock);
 	switch (sysctl_ip_vs_secure_tcp) {
 	case 0:
 		if (old_secure_tcp >= 2)
@@ -238,7 +242,7 @@ static void update_defense_level(void)
 	old_secure_tcp = sysctl_ip_vs_secure_tcp;
 	if (to_change >= 0)
 		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
-	write_unlock(&__ip_vs_securetcp_lock);
+	spin_unlock(&ip_vs_securetcp_lock);
 
 	local_bh_enable();
 }
@@ -401,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
  *	Get service by {proto,addr,port} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
+__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    __be16 vport)
 {
 	unsigned hash;
@@ -416,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
 		    && (svc->port == vport)
 		    && (svc->protocol == protocol)) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -429,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
  *	Get service by {fwmark} in the service table.
  */
 static inline struct ip_vs_service *
-__ip_vs_svc_fwm_get(int af, __u32 fwmark)
+__ip_vs_svc_fwm_find(int af, __u32 fwmark)
 {
 	unsigned hash;
 	struct ip_vs_service *svc;
@@ -440,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark)
 	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
 		if (svc->fwmark == fwmark && svc->af == af) {
 			/* HIT */
-			atomic_inc(&svc->usecnt);
 			return svc;
 		}
 	}
@@ -459,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 	/*
 	 *	Check the table hashed by fwmark first
 	 */
-	if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
+	if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark)))
 		goto out;
 
 	/*
 	 *	Check the table hashed by <protocol,addr,port>
 	 *	for "full" addressed entries
 	 */
-	svc = __ip_vs_service_get(af, protocol, vaddr, vport);
+	svc = __ip_vs_service_find(af, protocol, vaddr, vport);
 
 	if (svc == NULL
 	    && protocol == IPPROTO_TCP
@@ -476,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		 * Check if ftp service entry exists, the packet
 		 * might belong to FTP data connections.
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
+		svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT);
 	}
 
 	if (svc == NULL
@@ -484,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		/*
 		 * Check if the catch-all port (port zero) exists
 		 */
-		svc = __ip_vs_service_get(af, protocol, vaddr, 0);
+		svc = __ip_vs_service_find(af, protocol, vaddr, 0);
 	}
 
   out:
+	if (svc)
+		atomic_inc(&svc->usecnt);
 	read_unlock(&__ip_vs_svc_lock);
 
 	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
@@ -506,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 	dest->svc = svc;
 }
 
-static inline void
+static void
 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 {
 	struct ip_vs_service *svc = dest->svc;
 
 	dest->svc = NULL;
-	if (atomic_dec_and_test(&svc->refcnt))
+	if (atomic_dec_and_test(&svc->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 }
 
 
@@ -758,31 +767,18 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
  *	Update a destination in the given service
  */
 static void
-__ip_vs_update_dest(struct ip_vs_service *svc,
-		    struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+		    struct ip_vs_dest_user_kern *udest, int add)
 {
 	int conn_flags;
 
 	/* set the weight and the flags */
 	atomic_set(&dest->weight, udest->weight);
-	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
-
-	/* check if local node and update the flags */
-#ifdef CONFIG_IP_VS_IPV6
-	if (svc->af == AF_INET6) {
-		if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
-	} else
-#endif
-		if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
-			conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
-				| IP_VS_CONN_F_LOCALNODE;
-		}
+	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+	conn_flags |= IP_VS_CONN_F_INACTIVE;
 
 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
-	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
 	} else {
 		/*
@@ -813,6 +809,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
 		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
 	dest->u_threshold = udest->u_threshold;
 	dest->l_threshold = udest->l_threshold;
+
+	spin_lock(&dest->dst_lock);
+	ip_vs_dst_reset(dest);
+	spin_unlock(&dest->dst_lock);
+
+	if (add)
+		ip_vs_new_estimator(&dest->stats);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+
+	if (add) {
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+	}
+
+	/* call the update_service, because server weight may be changed */
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
 }
 
 
@@ -843,7 +862,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 			return -EINVAL;
 	}
 
-	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+	dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
 	if (dest == NULL) {
 		pr_err("%s(): no memory.\n", __func__);
 		return -ENOMEM;
@@ -860,13 +879,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 	atomic_set(&dest->activeconns, 0);
 	atomic_set(&dest->inactconns, 0);
 	atomic_set(&dest->persistconns, 0);
-	atomic_set(&dest->refcnt, 0);
+	atomic_set(&dest->refcnt, 1);
 
 	INIT_LIST_HEAD(&dest->d_list);
 	spin_lock_init(&dest->dst_lock);
 	spin_lock_init(&dest->stats.lock);
-	__ip_vs_update_dest(svc, dest, udest);
-	ip_vs_new_estimator(&dest->stats);
+	__ip_vs_update_dest(svc, dest, udest, 1);
 
 	*dest_p = dest;
 
@@ -926,65 +944,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
 			      ntohs(dest->vport));
 
-		__ip_vs_update_dest(svc, dest, udest);
-
 		/*
 		 * Get the destination from the trash
 		 */
 		list_del(&dest->n_list);
 
-		ip_vs_new_estimator(&dest->stats);
-
-		write_lock_bh(&__ip_vs_svc_lock);
-
+		__ip_vs_update_dest(svc, dest, udest, 1);
+		ret = 0;
+	} else {
 		/*
-		 * Wait until all other svc users go away.
+		 * Allocate and initialize the dest structure
 		 */
-		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-		list_add(&dest->n_list, &svc->destinations);
-		svc->num_dests++;
-
-		/* call the update_service function of its scheduler */
-		if (svc->scheduler->update_service)
-			svc->scheduler->update_service(svc);
-
-		write_unlock_bh(&__ip_vs_svc_lock);
-		return 0;
-	}
-
-	/*
-	 * Allocate and initialize the dest structure
-	 */
-	ret = ip_vs_new_dest(svc, udest, &dest);
-	if (ret) {
-		return ret;
+		ret = ip_vs_new_dest(svc, udest, &dest);
 	}
-
-	/*
-	 * Add the dest entry into the list
-	 */
-	atomic_inc(&dest->refcnt);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/*
-	 * Wait until all other svc users go away.
-	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	list_add(&dest->n_list, &svc->destinations);
-	svc->num_dests++;
-
-	/* call the update_service function of its scheduler */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
 	LeaveFunction(2);
 
-	return 0;
+	return ret;
 }
 
 
@@ -1023,19 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 		return -ENOENT;
 	}
 
-	__ip_vs_update_dest(svc, dest, udest);
-
-	write_lock_bh(&__ip_vs_svc_lock);
-
-	/* Wait until all other svc users go away */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
-
-	/* call the update_service, because server weight may be changed */
-	if (svc->scheduler->update_service)
-		svc->scheduler->update_service(svc);
-
-	write_unlock_bh(&__ip_vs_svc_lock);
-
+	__ip_vs_update_dest(svc, dest, udest, 0);
 	LeaveFunction(2);
 
 	return 0;
@@ -1062,6 +1025,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
 	 *  the destination into the trash.
 	 */
 	if (atomic_dec_and_test(&dest->refcnt)) {
+		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port));
 		ip_vs_dst_reset(dest);
 		/* simply decrease svc->refcnt here, let the caller check
 		   and release the service if nobody refers to it.
@@ -1128,7 +1095,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	/*
 	 *	Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 *	Unlink dest from the service
@@ -1157,6 +1124,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 {
 	int ret = 0;
 	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_pe *pe = NULL;
 	struct ip_vs_service *svc = NULL;
 
 	/* increase the module use count */
@@ -1167,7 +1135,17 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	if (sched == NULL) {
 		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
 		ret = -ENOENT;
-		goto out_mod_dec;
+		goto out_err;
+	}
+
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out_err;
+		}
 	}
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1177,7 +1155,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	}
 #endif
 
-	svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+	svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
 	if (svc == NULL) {
 		IP_VS_DBG(1, "%s(): no memory\n", __func__);
 		ret = -ENOMEM;
@@ -1185,7 +1163,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	}
 
 	/* I'm the first user of the service */
-	atomic_set(&svc->usecnt, 1);
+	atomic_set(&svc->usecnt, 0);
 	atomic_set(&svc->refcnt, 0);
 
 	svc->af = u->af;
@@ -1207,6 +1185,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		goto out_err;
 	sched = NULL;
 
+	/* Bind the ct retriever */
+	ip_vs_bind_pe(svc, pe);
+	pe = NULL;
+
 	/* Update the virtual service counters */
 	if (svc->port == FTPPORT)
 		atomic_inc(&ip_vs_ftpsvc_counter);
@@ -1227,10 +1209,9 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 	*svc_p = svc;
 	return 0;
 
-  out_err:
+ out_err:
 	if (svc != NULL) {
-		if (svc->scheduler)
-			ip_vs_unbind_scheduler(svc);
+		ip_vs_unbind_scheduler(svc);
 		if (svc->inc) {
 			local_bh_disable();
 			ip_vs_app_inc_put(svc->inc);
@@ -1239,8 +1220,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
 		kfree(svc);
 	}
 	ip_vs_scheduler_put(sched);
+	ip_vs_pe_put(pe);
 
-  out_mod_dec:
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 
@@ -1255,6 +1236,7 @@ static int
 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
 	struct ip_vs_scheduler *sched, *old_sched;
+	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
 	int ret = 0;
 
 	/*
@@ -1267,6 +1249,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	}
 	old_sched = sched;
 
+	if (u->pe_name && *u->pe_name) {
+		pe = ip_vs_pe_get(u->pe_name);
+		if (pe == NULL) {
+			pr_info("persistence engine module ip_vs_pe_%s "
+				"not found\n", u->pe_name);
+			ret = -ENOENT;
+			goto out;
+		}
+		old_pe = pe;
+	}
+
 #ifdef CONFIG_IP_VS_IPV6
 	if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
 		ret = -EINVAL;
@@ -1279,7 +1272,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	/*
 	 * Wait until all other svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	/*
 	 * Set the flags and timeout value
@@ -1318,15 +1311,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 		}
 	}
 
+	old_pe = svc->pe;
+	if (pe != old_pe) {
+		ip_vs_unbind_pe(svc);
+		ip_vs_bind_pe(svc, pe);
+	}
+
   out_unlock:
 	write_unlock_bh(&__ip_vs_svc_lock);
-#ifdef CONFIG_IP_VS_IPV6
   out:
-#endif
-
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
-
+	ip_vs_scheduler_put(old_sched);
+	ip_vs_pe_put(old_pe);
 	return ret;
 }
 
@@ -1340,6 +1335,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 {
 	struct ip_vs_dest *dest, *nxt;
 	struct ip_vs_scheduler *old_sched;
+	struct ip_vs_pe *old_pe;
+
+	pr_info("%s: enter\n", __func__);
 
 	/* Count only IPv4 services for old get/setsockopt interface */
 	if (svc->af == AF_INET)
@@ -1350,8 +1348,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	/* Unbind scheduler */
 	old_sched = svc->scheduler;
 	ip_vs_unbind_scheduler(svc);
-	if (old_sched)
-		ip_vs_scheduler_put(old_sched);
+	ip_vs_scheduler_put(old_sched);
+
+	/* Unbind persistence engine */
+	old_pe = svc->pe;
+	ip_vs_unbind_pe(svc);
+	ip_vs_pe_put(old_pe);
 
 	/* Unbind app inc */
 	if (svc->inc) {
@@ -1378,21 +1380,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
 	/*
 	 *    Free the service if nobody refers to it
 	 */
-	if (atomic_read(&svc->refcnt) == 0)
+	if (atomic_read(&svc->refcnt) == 0) {
+		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+			      svc->fwmark,
+			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+			      ntohs(svc->port), atomic_read(&svc->usecnt));
 		kfree(svc);
+	}
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
 }
 
 /*
- *	Delete a service from the service list
+ * Unlink a service from list and try to delete it if its refcnt reached 0
  */
-static int ip_vs_del_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc)
 {
-	if (svc == NULL)
-		return -EEXIST;
-
 	/*
 	 * Unhash it from the service table
 	 */
@@ -1403,11 +1407,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
 	/*
 	 * Wait until all the svc users go away.
 	 */
-	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
 
 	__ip_vs_del_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
+}
+
+/*
+ *	Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (svc == NULL)
+		return -EEXIST;
+	ip_vs_unlink_service(svc);
 
 	return 0;
 }
@@ -1426,14 +1440,7 @@ static int ip_vs_flush(void)
 	 */
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1443,14 +1450,7 @@ static int ip_vs_flush(void)
 	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		list_for_each_entry_safe(svc, nxt,
 					 &ip_vs_svc_fwm_table[idx], f_list) {
-			write_lock_bh(&__ip_vs_svc_lock);
-			ip_vs_svc_unhash(svc);
-			/*
-			 * Wait until all the svc users go away.
-			 */
-			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-			__ip_vs_del_service(svc);
-			write_unlock_bh(&__ip_vs_svc_lock);
+			ip_vs_unlink_service(svc);
 		}
 	}
 
@@ -1579,6 +1579,15 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
+#ifdef CONFIG_IP_VS_NFCT
+	{
+		.procname	= "conntrack",
+		.data		= &sysctl_ip_vs_conntrack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{
 		.procname	= "secure_tcp",
 		.data		= &sysctl_ip_vs_secure_tcp,
@@ -1586,6 +1595,13 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_do_defense_mode,
 	},
+	{
+		.procname	= "snat_reroute",
+		.data		= &sysctl_ip_vs_snat_reroute,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if 0
 	{
 		.procname	= "timeout_established",
@@ -1864,14 +1880,16 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 					   svc->scheduler->name);
 			else
 #endif
-				seq_printf(seq, "%s  %08X:%04X %s ",
+				seq_printf(seq, "%s  %08X:%04X %s %s ",
 					   ip_vs_proto_name(svc->protocol),
 					   ntohl(svc->addr.ip),
 					   ntohs(svc->port),
-					   svc->scheduler->name);
+					   svc->scheduler->name,
+					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		} else {
-			seq_printf(seq, "FWM  %08X %s ",
-				   svc->fwmark, svc->scheduler->name);
+			seq_printf(seq, "FWM  %08X %s %s",
+				   svc->fwmark, svc->scheduler->name,
+				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		}
 
 		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
@@ -2039,6 +2057,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 				  struct ip_vs_service_user *usvc_compat)
 {
+	memset(usvc, 0, sizeof(*usvc));
+
 	usvc->af		= AF_INET;
 	usvc->protocol		= usvc_compat->protocol;
 	usvc->addr.ip		= usvc_compat->addr;
@@ -2056,6 +2076,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
 				   struct ip_vs_dest_user *udest_compat)
 {
+	memset(udest, 0, sizeof(*udest));
+
 	udest->addr.ip		= udest_compat->addr;
 	udest->port		= udest_compat->port;
 	udest->conn_flags	= udest_compat->conn_flags;
@@ -2145,10 +2167,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 
 	/* Lookup the exact service by <protocol, addr, port> or fwmark */
 	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
+		svc = __ip_vs_service_find(usvc.af, usvc.protocol,
+					   &usvc.addr, usvc.port);
 	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
+		svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark);
 
 	if (cmd != IP_VS_SO_SET_ADD
 	    && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2187,9 +2209,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		ret = -EINVAL;
 	}
 
-	if (svc)
-		ip_vs_service_put(svc);
-
   out_unlock:
 	mutex_unlock(&__ip_vs_mutex);
   out_dec:
@@ -2282,10 +2301,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 	int ret = 0;
 
 	if (get->fwmark)
-		svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
+		svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark);
 	else
-		svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
-					  get->port);
+		svc = __ip_vs_service_find(AF_INET, get->protocol, &addr,
+					   get->port);
 
 	if (svc) {
 		int count = 0;
@@ -2313,7 +2332,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
 			}
 			count++;
 		}
-		ip_vs_service_put(svc);
 	} else
 		ret = -ESRCH;
 	return ret;
@@ -2434,15 +2452,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 		entry = (struct ip_vs_service_entry *)arg;
 		addr.ip = entry->addr;
 		if (entry->fwmark)
-			svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
+			svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark);
 		else
-			svc = __ip_vs_service_get(AF_INET, entry->protocol,
-						  &addr, entry->port);
+			svc = __ip_vs_service_find(AF_INET, entry->protocol,
+						   &addr, entry->port);
 		if (svc) {
 			ip_vs_copy_service(entry, svc);
 			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
 				ret = -EFAULT;
-			ip_vs_service_put(svc);
 		} else
 			ret = -ESRCH;
 	}
@@ -2557,6 +2574,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
 	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
 	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
 					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_PENAME_MAXLEN },
 	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
 					    .len = sizeof(struct ip_vs_flags) },
 	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
@@ -2633,6 +2652,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	}
 
 	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	if (svc->pe)
+		NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
 	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
 	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
@@ -2709,10 +2730,12 @@ nla_put_failure:
 }
 
 static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
-				    struct nlattr *nla, int full_entry)
+				    struct nlattr *nla, int full_entry,
+				    struct ip_vs_service **ret_svc)
 {
 	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
 	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+	struct ip_vs_service *svc;
 
 	/* Parse mandatory identifying service fields first */
 	if (nla == NULL ||
@@ -2748,14 +2771,21 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		usvc->fwmark = 0;
 	}
 
+	if (usvc->fwmark)
+		svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark);
+	else
+		svc = __ip_vs_service_find(usvc->af, usvc->protocol,
+					   &usvc->addr, usvc->port);
+	*ret_svc = svc;
+
 	/* If a full entry was requested, check for the additional fields */
 	if (full_entry) {
-		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
 			      *nla_netmask;
 		struct ip_vs_flags flags;
-		struct ip_vs_service *svc;
 
 		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
 		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
 		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
 		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
@@ -2766,21 +2796,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 		nla_memcpy(&flags, nla_flags, sizeof(flags));
 
 		/* prefill flags from service if it already exists */
-		if (usvc->fwmark)
-			svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
-		else
-			svc = __ip_vs_service_get(usvc->af, usvc->protocol,
-						  &usvc->addr, usvc->port);
-		if (svc) {
+		if (svc)
 			usvc->flags = svc->flags;
-			ip_vs_service_put(svc);
-		} else
-			usvc->flags = 0;
 
 		/* set new flags from userland */
 		usvc->flags = (usvc->flags & ~flags.mask) |
 			      (flags.flags & flags.mask);
 		usvc->sched_name = nla_data(nla_sched);
+		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
 		usvc->timeout = nla_get_u32(nla_timeout);
 		usvc->netmask = nla_get_u32(nla_netmask);
 	}
@@ -2791,17 +2814,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
 static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
 {
 	struct ip_vs_service_user_kern usvc;
+	struct ip_vs_service *svc;
 	int ret;
 
-	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (usvc.fwmark)
-		return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-	else
-		return __ip_vs_service_get(usvc.af, usvc.protocol,
-					   &usvc.addr, usvc.port);
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc);
+	return ret ? ERR_PTR(ret) : svc;
 }
 
 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
@@ -2892,7 +2909,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
 
 nla_put_failure:
 	cb->args[0] = idx;
-	ip_vs_service_put(svc);
 
 out_err:
 	mutex_unlock(&__ip_vs_mutex);
@@ -3105,17 +3121,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 
 	ret = ip_vs_genl_parse_service(&usvc,
 				       info->attrs[IPVS_CMD_ATTR_SERVICE],
-				       need_full_svc);
+				       need_full_svc, &svc);
 	if (ret)
 		goto out;
 
-	/* Lookup the exact service by <protocol, addr, port> or fwmark */
-	if (usvc.fwmark == 0)
-		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
-					  &usvc.addr, usvc.port);
-	else
-		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
-
 	/* Unless we're adding a new service, the service must already exist */
 	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
 		ret = -ESRCH;
@@ -3149,6 +3158,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		break;
 	case IPVS_CMD_DEL_SERVICE:
 		ret = ip_vs_del_service(svc);
+		/* do not use svc, it can be freed */
 		break;
 	case IPVS_CMD_NEW_DEST:
 		ret = ip_vs_add_dest(svc, &udest);
@@ -3167,8 +3177,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 	}
 
 out:
-	if (svc)
-		ip_vs_service_put(svc);
 	mutex_unlock(&__ip_vs_mutex);
 
 	return ret;
@@ -3214,7 +3222,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
 			goto out_err;
 		} else if (svc) {
 			ret = ip_vs_genl_fill_service(msg, svc);
-			ip_vs_service_put(svc);
 			if (ret)
 				goto nla_put_failure;
 		} else {
@@ -3383,6 +3390,16 @@ int __init ip_vs_control_init(void)
 
 	EnterFunction(2);
 
+	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+	}
+	smp_wmb();
+
 	ret = nf_register_sockopt(&ip_vs_sockopts);
 	if (ret) {
 		pr_err("cannot register sockopt.\n");
@@ -3401,15 +3418,6 @@ int __init ip_vs_control_init(void)
 
 	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
 
-	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-	}
-	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
-	}
-
 	ip_vs_new_estimator(&ip_vs_stats);
 
 	/* Hook the defense timer */
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 2ae747a376a..75455000ad1 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -32,6 +32,10 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
 #include <linux/gfp.h>
 #include <net/protocol.h>
 #include <net/tcp.h>
@@ -60,6 +64,8 @@ static int ip_vs_ftp_pasv;
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 {
+	/* We use connection tracking for the command connection */
+	cp->flags |= IP_VS_CONN_F_NFCT;
 	return 0;
 }
 
@@ -123,7 +129,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 	return 1;
 }
 
-
 /*
  * Look at outgoing ftp packets to catch the response to a PASV command
  * from the server (inside-to-outside).
@@ -149,7 +154,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	struct ip_vs_conn *n_cp;
 	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
 	unsigned buf_len;
-	int ret;
+	int ret = 0;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
 
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -188,14 +195,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		/*
 		 * Now update or create an connection entry for it
 		 */
-		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-					  &cp->caddr, 0);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, iph->protocol,
+					      &from, port, &cp->caddr, 0, &p);
+			n_cp = ip_vs_conn_out_get(&p);
+		}
 		if (!n_cp) {
-			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-					      &cp->caddr, 0,
-					      &cp->vaddr, port,
-					      &from, port,
-					      IP_VS_CONN_F_NO_CPORT,
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr,
+					      0, &cp->vaddr, port, &p);
+			n_cp = ip_vs_conn_new(&p, &from, port,
+					      IP_VS_CONN_F_NO_CPORT |
+					      IP_VS_CONN_F_NFCT,
 					      cp->dest);
 			if (!n_cp)
 				return 0;
@@ -219,19 +231,31 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
 		buf_len = strlen(buf);
 
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+			/* If mangling fails this function will return 0
+			 * which will cause the packet to be dropped.
+			 * Mangling can only fail under memory pressure,
+			 * hopefully it will succeed on the retransmitted
+			 * packet.
+			 */
+			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						       start-data, end-start,
+						       buf, buf_len);
+			if (ret) {
+				ip_vs_nfct_expect_related(skb, ct, n_cp,
+							  IPPROTO_TCP, 0, 0);
+				if (skb->ip_summed == CHECKSUM_COMPLETE)
+					skb->ip_summed = CHECKSUM_UNNECESSARY;
+				/* csum is updated */
+				ret = 1;
+			}
+		}
+
 		/*
-		 * Calculate required delta-offset to keep TCP happy
+		 * Not setting 'diff' is intentional, otherwise the sequence
+		 * would be adjusted twice.
 		 */
-		*diff = buf_len - (end-start);
-
-		if (*diff == 0) {
-			/* simply replace it with new passive address */
-			memcpy(start, buf, buf_len);
-			ret = 1;
-		} else {
-			ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
-					  end-start, buf, buf_len);
-		}
 
 		cp->app_data = NULL;
 		ip_vs_tcp_conn_listen(n_cp);
@@ -332,21 +356,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		  ip_vs_proto_name(iph->protocol),
 		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
 
-	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
-				 &to, port,
-				 &cp->vaddr, htons(ntohs(cp->vport)-1));
-	if (!n_cp) {
-		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
-				      &to, port,
+	{
+		struct ip_vs_conn_param p;
+		ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port,
 				      &cp->vaddr, htons(ntohs(cp->vport)-1),
-				      &cp->daddr, htons(ntohs(cp->dport)-1),
-				      0,
-				      cp->dest);
-		if (!n_cp)
-			return 0;
+				      &p);
+		n_cp = ip_vs_conn_in_get(&p);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(&p, &cp->daddr,
+					      htons(ntohs(cp->dport)-1),
+					      IP_VS_CONN_F_NFCT, cp->dest);
+			if (!n_cp)
+				return 0;
 
-		/* add its controller */
-		ip_vs_control_add(n_cp, cp);
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
 	}
 
 	/*
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 94a45213faa..9323f894419 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -11,7 +11,7 @@
  * Changes:
  *     Martin Hamilton         :    fixed the terrible locking bugs
  *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
- *     Wensong Zhang           :    fixed the uninitilized tbl->lock bug
+ *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
  *     Wensong Zhang           :    added doing full expiration check to
  *                                   collect stale entries of 24+ hours when
  *                                   no partial expire check in a half hour
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 535dc2b419d..dbeed8ea421 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -386,7 +386,7 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
 		ip_vs_addr_copy(dest->af, &en->addr, daddr);
 		en->lastuse = jiffies;
 
-		/* initilize its dest set */
+		/* initialize its dest set */
 		atomic_set(&(en->set.size), 0);
 		INIT_LIST_HEAD(&en->set.list);
 		rwlock_init(&en->set.lock);
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 00000000000..4680647cd45
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c:	Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>		Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>	Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple new_tuple;
+
+	if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+	    nf_ct_is_dying(ct))
+		return;
+
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return;
+
+	/* Alter reply only in original direction */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return;
+
+	/*
+	 * The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	/*
+	 * This will also take care of UDP and other protocols.
+	 */
+	if (outin) {
+		new_tuple.src.u3 = cp->daddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.src.u.tcp.port = cp->dport;
+	} else {
+		new_tuple.dst.u3 = cp->vaddr;
+		if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
+			new_tuple.dst.u.tcp.port = cp->vport;
+	}
+	IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		  "ctinfo=%d, old reply=" FMT_TUPLE
+		  ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+		  __func__, ct, ct->status, ctinfo,
+		  ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+		  ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+	nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+	struct nf_conntrack_expect *exp)
+{
+	struct nf_conntrack_tuple *orig, new_reply;
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
+
+	if (exp->tuple.src.l3num != PF_INET)
+		return;
+
+	/*
+	 * We assume that no NF locks are held before this callback.
+	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+	 * expectations even if they use wildcard values, now we provide the
+	 * actual values from the newly created original conntrack direction.
+	 * The conntrack is confirmed when packet reaches IPVS hooks.
+	 */
+
+	/* RS->CLIENT */
+	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum,
+			      &orig->src.u3, orig->src.u.tcp.port,
+			      &orig->dst.u3, orig->dst.u.tcp.port, &p);
+	cp = ip_vs_conn_out_get(&p);
+	if (cp) {
+		/* Change reply CLIENT->RS to CLIENT->VS */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.dst.u3 = cp->vaddr;
+		new_reply.dst.u.tcp.port = cp->vport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+			  ", inout cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	/* CLIENT->VS */
+	cp = ip_vs_conn_in_get(&p);
+	if (cp) {
+		/* Change reply VS->CLIENT to RS->CLIENT */
+		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+			  __func__, ct, ct->status,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		new_reply.src.u3 = cp->daddr;
+		new_reply.src.u.tcp.port = cp->dport;
+		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+			  __func__, ct,
+			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+			  ARG_CONN(cp));
+		goto alter;
+	}
+
+	IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		  " - unknown expect\n",
+		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	return;
+
+alter:
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+		nf_conntrack_alter_reply(ct, &new_reply);
+	ip_vs_conn_put(cp);
+	return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+			       struct ip_vs_conn *cp, u_int8_t proto,
+			       const __be16 port, int from_rs)
+{
+	struct nf_conntrack_expect *exp;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (!exp)
+		return;
+
+	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+			from_rs ? &cp->daddr : &cp->caddr,
+			from_rs ? &cp->caddr : &cp->vaddr,
+			proto, port ? &port : NULL,
+			from_rs ? &cp->cport : &cp->vport);
+
+	exp->expectfn = ip_vs_nfct_expect_callback;
+
+	IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+		__func__, ct, ARG_TUPLE(&exp->tuple));
+	nf_ct_expect_related(exp);
+	nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct nf_conntrack_tuple tuple;
+
+	if (!cp->cport)
+		return;
+
+	tuple = (struct nf_conntrack_tuple) {
+		.dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+	tuple.src.u3 = cp->caddr;
+	tuple.src.u.all = cp->cport;
+	tuple.src.l3num = cp->af;
+	tuple.dst.u3 = cp->vaddr;
+	tuple.dst.u.all = cp->vport;
+
+	IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+		" for conn " FMT_CONN "\n",
+		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+	h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+	if (h) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		/* Show what happens instead of calling nf_ct_kill() */
+		if (del_timer(&ct->timeout)) {
+			IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+				FMT_TUPLE "\n",
+				__func__, ct, ARG_TUPLE(&tuple));
+			if (ct->timeout.function)
+				ct->timeout.function(ct->timeout.data);
+		} else {
+			IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+				FMT_TUPLE "\n",
+				__func__, ct, ARG_TUPLE(&tuple));
+		}
+		nf_ct_put(ct);
+	} else {
+		IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+			__func__, ARG_TUPLE(&tuple));
+	}
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 00000000000..3414af70ee1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,147 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* lock for service table */
+static DEFINE_SPINLOCK(ip_vs_pe_lock);
+
+/* Bind a service with a pe */
+void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
+{
+	svc->pe = pe;
+}
+
+/* Unbind a service from its pe */
+void ip_vs_unbind_pe(struct ip_vs_service *svc)
+{
+	svc->pe = NULL;
+}
+
+/* Get pe in the pe list by name */
+static struct ip_vs_pe *
+ip_vs_pe_getbyname(const char *pe_name)
+{
+	struct ip_vs_pe *pe;
+
+	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+		  pe_name);
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	list_for_each_entry(pe, &ip_vs_pe, n_list) {
+		/* Test and get the modules atomically */
+		if (pe->module &&
+		    !try_module_get(pe->module)) {
+			/* This pe is just deleted */
+			continue;
+		}
+		if (strcmp(pe_name, pe->name)==0) {
+			/* HIT */
+			spin_unlock_bh(&ip_vs_pe_lock);
+			return pe;
+		}
+		if (pe->module)
+			module_put(pe->module);
+	}
+
+	spin_unlock_bh(&ip_vs_pe_lock);
+	return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_get(const char *name)
+{
+	struct ip_vs_pe *pe;
+
+	/* Search for the pe by name */
+	pe = ip_vs_pe_getbyname(name);
+
+	/* If pe not found, load the module and search again */
+	if (!pe) {
+		request_module("ip_vs_pe_%s", name);
+		pe = ip_vs_pe_getbyname(name);
+	}
+
+	return pe;
+}
+
+void ip_vs_pe_put(struct ip_vs_pe *pe)
+{
+	if (pe && pe->module)
+		module_put(pe->module);
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	struct ip_vs_pe *tmp;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	spin_lock_bh(&ip_vs_pe_lock);
+
+	if (!list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		ip_vs_use_count_dec();
+		pr_err("%s(): [%s] pe already linked\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Make sure that the pe with this name doesn't exist
+	 * in the pe list.
+	 */
+	list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+		if (strcmp(tmp->name, pe->name) == 0) {
+			spin_unlock_bh(&ip_vs_pe_lock);
+			ip_vs_use_count_dec();
+			pr_err("%s(): [%s] pe already existed "
+			       "in the system\n", __func__, pe->name);
+			return -EINVAL;
+		}
+	}
+	/* Add it into the d-linked pe list */
+	list_add(&pe->n_list, &ip_vs_pe);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	pr_info("[%s] pe registered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+	spin_lock_bh(&ip_vs_pe_lock);
+	if (list_empty(&pe->n_list)) {
+		spin_unlock_bh(&ip_vs_pe_lock);
+		pr_err("%s(): [%s] pe is not in the list. failed\n",
+		       __func__, pe->name);
+		return -EINVAL;
+	}
+
+	/* Remove it from the d-linked pe list */
+	list_del(&pe->n_list);
+	spin_unlock_bh(&ip_vs_pe_lock);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	pr_info("[%s] pe unregistered.\n", pe->name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 00000000000..b8b4e9620f3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,169 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+				    const char *callid, size_t callid_len,
+				    int *idx)
+{
+	size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+	memcpy(buf + *idx, callid, len);
+	buf[*idx+len] = '\0';
+	*idx += len + 1;
+	return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len)					\
+	ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf),		\
+			 callid, len, &ip_vs_dbg_idx)
+#endif
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+		      unsigned int datalen,
+		      unsigned int *matchoff, unsigned int *matchlen)
+{
+	/* Find callid */
+	while (1) {
+		int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+					    SIP_HDR_CALL_ID, matchoff,
+					    matchlen);
+		if (ret > 0)
+			break;
+		if (!ret)
+			return 0;
+		dataoff += *matchoff;
+	}
+
+	/* Empty callid is useless */
+	if (!*matchlen)
+		return -EINVAL;
+
+	/* Too large is useless */
+	if (*matchlen > IP_VS_PEDATA_MAXLEN)
+		return -EINVAL;
+
+	/* SIP headers are always followed by a line terminator */
+	if (*matchoff + *matchlen == datalen)
+		return -EINVAL;
+
+	/* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+	 * RFC 3261 allows only CRLF, we support both. */
+	if (*(dptr + *matchoff + *matchlen) != '\r' &&
+	    *(dptr + *matchoff + *matchlen) != '\n')
+		return -EINVAL;
+
+	IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+		      IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+		      *matchlen);
+	return 0;
+}
+
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+	struct ip_vs_iphdr iph;
+	unsigned int dataoff, datalen, matchoff, matchlen;
+	const char *dptr;
+
+	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
+
+	/* Only useful with UDP */
+	if (iph.protocol != IPPROTO_UDP)
+		return -EINVAL;
+
+	/* No Data ? */
+	dataoff = iph.len + sizeof(struct udphdr);
+	if (dataoff >= skb->len)
+		return -EINVAL;
+
+	dptr = skb->data + dataoff;
+	datalen = skb->len - dataoff;
+
+	if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+		return -EINVAL;
+
+	p->pe_data = kmalloc(matchlen, GFP_ATOMIC);
+	if (!p->pe_data)
+		return -ENOMEM;
+
+	/* N.B: pe_data is only set on success,
+	 * this allows fallback to the default persistence logic on failure
+	 */
+	memcpy(p->pe_data, dptr + matchoff, matchlen);
+	p->pe_data_len = matchlen;
+
+	return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+				  struct ip_vs_conn *ct)
+
+{
+	bool ret = 0;
+
+	if (ct->af == p->af &&
+	    ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+	    /* protocol should only be IPPROTO_IP if
+	     * d_addr is a fwmark */
+	    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+			     p->vaddr, &ct->vaddr) &&
+	    ct->vport == p->vport &&
+	    ct->flags & IP_VS_CONN_F_TEMPLATE &&
+	    ct->protocol == p->protocol &&
+	    ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+	    !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+		ret = 1;
+
+	IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+		      ip_vs_proto_name(p->protocol),
+		      IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+		      ret ? "hit" : "not hit");
+
+	return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+				 u32 initval, bool inverse)
+{
+	return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+	memcpy(buf, cp->pe_data, cp->pe_data_len);
+	return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+	.name =			"sip",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+	.fill_param =		ip_vs_sip_fill_param,
+	.ct_match =		ip_vs_sip_ct_match,
+	.hashkey_raw =		ip_vs_sip_hashkey_raw,
+	.show_pe_data =		ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+	return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+	unregister_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 2d3d5e4b35f..c5399839087 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -98,6 +98,7 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
 
 	return NULL;
 }
+EXPORT_SYMBOL(ip_vs_proto_get);
 
 
 /*
@@ -171,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 	else if (ih->frag_off & htons(IP_OFFSET))
 		sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
 	else {
-		__be16 _ports[2], *pptr
-;
+		__be16 _ports[2], *pptr;
+
 		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
@@ -222,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 
 void
-ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
 			  const struct sk_buff *skb,
 			  int offset,
 			  const char *msg)
 {
 #ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
+	if (af == AF_INET6)
 		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
 	else
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc12fd..3a0461117d3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -40,6 +40,19 @@ struct isakmp_hdr {
 
 #define PORT_ISAKMP	500
 
+static void
+ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph,
+			     int inverse, struct ip_vs_conn_param *p)
+{
+	if (likely(!inverse))
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->saddr, htons(PORT_ISAKMP),
+				      &iph->daddr, htons(PORT_ISAKMP), p);
+	else
+		ip_vs_conn_fill_param(af, IPPROTO_UDP,
+				      &iph->daddr, htons(PORT_ISAKMP),
+				      &iph->saddr, htons(PORT_ISAKMP), p);
+}
 
 static struct ip_vs_conn *
 ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
@@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		   int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->saddr,
-				       htons(PORT_ISAKMP),
-				       &iph->daddr,
-				       htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
-				       &iph->daddr,
-				       htons(PORT_ISAKMP),
-				       &iph->saddr,
-				       htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_in_get(&p);
 	if (!cp) {
 		/*
 		 * We are not sure if the packet is from our
@@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 		    int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct ip_vs_conn_param p;
 
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->saddr,
-					htons(PORT_ISAKMP),
-					&iph->daddr,
-					htons(PORT_ISAKMP));
-	} else {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
-					&iph->daddr,
-					htons(PORT_ISAKMP),
-					&iph->saddr,
-					htons(PORT_ISAKMP));
-	}
-
+	ah_esp_conn_fill_param_proto(af, iph, inverse, &p);
+	cp = ip_vs_conn_out_get(&p);
 	if (!cp) {
 		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
 			      "%s%s %s->%s\n",
@@ -126,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	return 0;
 }
 
-
-static void
-ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "TRUNCATED");
-	else
-		sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
-
-	pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-static void
-ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		       int offset, const char *msg)
-{
-	char buf[256];
-	struct ipv6hdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "TRUNCATED");
-	else
-		sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
-
-	pr_debug("%s: %s %s\n", msg, pp->name, buf);
-}
-#endif
-
-static void
-ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
-		    int offset, const char *msg)
-{
-#ifdef CONFIG_IP_VS_IPV6
-	if (skb->protocol == htons(ETH_P_IPV6))
-		ah_esp_debug_packet_v6(pp, skb, offset, msg);
-	else
-#endif
-		ah_esp_debug_packet_v4(pp, skb, offset, msg);
-}
-
-
 static void ah_esp_init(struct ip_vs_protocol *pp)
 {
 	/* nothing to do now */
@@ -204,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 	.set_state_timeout =	NULL,
 };
@@ -228,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
 	.register_app =		NULL,
 	.unregister_app =	NULL,
 	.app_conn_bind =	NULL,
-	.debug_packet =		ah_esp_debug_packet,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
 };
 #endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index c9a3f7a21d5..1ea96bcd342 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -8,55 +8,6 @@
 #include <net/sctp/checksum.h>
 #include <net/ip_vs.h>
 
-
-static struct ip_vs_conn *
-sctp_conn_in_get(int af,
-		 const struct sk_buff *skb,
-		 struct ip_vs_protocol *pp,
-		 const struct ip_vs_iphdr *iph,
-		 unsigned int proto_off,
-		 int inverse)
-{
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) 
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->saddr, pptr[0],
-					 &iph->daddr, pptr[1]);
-	else 
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->daddr, pptr[1],
-					 &iph->saddr, pptr[0]);
-}
-
-static struct ip_vs_conn *
-sctp_conn_out_get(int af,
-		  const struct sk_buff *skb,
-		  struct ip_vs_protocol *pp,
-		  const struct ip_vs_iphdr *iph,
-		  unsigned int proto_off,
-		  int inverse)
-{
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) 
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->saddr, pptr[0],
-					  &iph->daddr, pptr[1]);
-	else 
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->daddr, pptr[1],
-					  &iph->saddr, pptr[0]);
-}
-
 static int
 sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		   int *verdict, struct ip_vs_conn **cpp)
@@ -80,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	if ((sch->type == SCTP_CID_INIT) &&
 	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				     &iph.daddr, sh->dest))) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -93,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
@@ -110,6 +63,7 @@ sctp_snat_handler(struct sk_buff *skb,
 {
 	sctp_sctphdr_t *sctph;
 	unsigned int sctphoff;
+	struct sk_buff *iter;
 	__be32 crc32;
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -138,8 +92,8 @@ sctp_snat_handler(struct sk_buff *skb,
 
 	/* Calculate the checksum */
 	crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
-	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
-		crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+	skb_walk_frags(skb, iter)
+		crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
 				          crc32);
 	crc32 = sctp_end_cksum(crc32);
 	sctph->checksum = crc32;
@@ -151,9 +105,9 @@ static int
 sctp_dnat_handler(struct sk_buff *skb,
 		  struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
 {
-
 	sctp_sctphdr_t *sctph;
 	unsigned int sctphoff;
+	struct sk_buff *iter;
 	__be32 crc32;
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -173,7 +127,7 @@ sctp_dnat_handler(struct sk_buff *skb,
 			return 0;
 
 		/* Call application helper if needed */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!ip_vs_app_pkt_in(cp, skb))
 			return 0;
 	}
 
@@ -182,8 +136,8 @@ sctp_dnat_handler(struct sk_buff *skb,
 
 	/* Calculate the checksum */
 	crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff);
-	for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next)
-		crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb),
+	skb_walk_frags(skb, iter)
+		crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter),
 					  crc32);
 	crc32 = sctp_end_cksum(crc32);
 	sctph->checksum = crc32;
@@ -194,9 +148,9 @@ sctp_dnat_handler(struct sk_buff *skb,
 static int
 sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 {
-	struct sk_buff *list = skb_shinfo(skb)->frag_list;
 	unsigned int sctphoff;
 	struct sctphdr *sh, _sctph;
+	struct sk_buff *iter;
 	__le32 cmp;
 	__le32 val;
 	__u32 tmp;
@@ -215,15 +169,15 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 	cmp = sh->checksum;
 
 	tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb));
-	for (; list; list = list->next)
-		tmp = sctp_update_cksum((__u8 *) list->data,
-					skb_headlen(list), tmp);
+	skb_walk_frags(skb, iter)
+		tmp = sctp_update_cksum((__u8 *) iter->data,
+					skb_headlen(iter), tmp);
 
 	val = sctp_end_cksum(tmp);
 
 	if (val != cmp) {
 		/* CRC failure, dump it. */
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 				"Failed checksum for");
 		return 0;
 	}
@@ -1169,8 +1123,8 @@ struct ip_vs_protocol ip_vs_protocol_sctp = {
 	.register_app = sctp_register_app,
 	.unregister_app = sctp_unregister_app,
 	.conn_schedule = sctp_conn_schedule,
-	.conn_in_get = sctp_conn_in_get,
-	.conn_out_get = sctp_conn_out_get,
+	.conn_in_get = ip_vs_conn_in_get_proto,
+	.conn_out_get = ip_vs_conn_out_get_proto,
 	.snat_handler = sctp_snat_handler,
 	.dnat_handler = sctp_dnat_handler,
 	.csum_check = sctp_csum_check,
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 91d28e07374..f6c5200e214 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -27,52 +27,6 @@
 
 #include <net/ip_vs.h>
 
-
-static struct ip_vs_conn *
-tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		int inverse)
-{
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->saddr, pptr[0],
-					 &iph->daddr, pptr[1]);
-	} else {
-		return ip_vs_conn_in_get(af, iph->protocol,
-					 &iph->daddr, pptr[1],
-					 &iph->saddr, pptr[0]);
-	}
-}
-
-static struct ip_vs_conn *
-tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		 int inverse)
-{
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->saddr, pptr[0],
-					  &iph->daddr, pptr[1]);
-	} else {
-		return ip_vs_conn_out_get(af, iph->protocol,
-					  &iph->daddr, pptr[1],
-					  &iph->saddr, pptr[0]);
-	}
-}
-
-
 static int
 tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		  int *verdict, struct ip_vs_conn **cpp)
@@ -89,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return 0;
 	}
 
+	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
 	if (th->syn &&
 	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
 				     th->dest))) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -106,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
@@ -147,15 +104,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		tcph->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
+						csum_unfold(tcph->check))));
 	else
 #endif
 	tcph->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(tcph->check))));
+						csum_unfold(tcph->check))));
 }
 
 
@@ -166,6 +123,7 @@ tcp_snat_handler(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -180,13 +138,20 @@ tcp_snat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
 
 		/* Call application helper if needed */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
 	}
 
 	tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -197,12 +162,13 @@ tcp_snat_handler(struct sk_buff *skb,
 		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
-	} else if (!cp->app) {
+	} else if (!payload_csum) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		tcph->check = 0;
@@ -220,6 +186,7 @@ tcp_snat_handler(struct sk_buff *skb,
 							skb->len - tcphoff,
 							cp->protocol,
 							skb->csum);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, tcph->check,
@@ -236,6 +203,7 @@ tcp_dnat_handler(struct sk_buff *skb,
 	struct tcphdr *tcph;
 	unsigned int tcphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -250,6 +218,8 @@ tcp_dnat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -258,8 +228,13 @@ tcp_dnat_handler(struct sk_buff *skb,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn and iph ack_seq stuff
 		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - tcphoff;
+		else
+			payload_csum = 1;
 	}
 
 	tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -269,15 +244,16 @@ tcp_dnat_handler(struct sk_buff *skb,
 	 *	Adjust TCP checksums
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - tcphoff));
-	} else if (!cp->app) {
+	} else if (!payload_csum) {
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		tcph->check = 0;
@@ -324,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 					    skb->len - tcphoff,
 					    ipv6_hdr(skb)->nexthdr,
 					    skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 						 "Failed checksum for");
 				return 0;
 			}
@@ -335,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 					      skb->len - tcphoff,
 					      ip_hdr(skb)->protocol,
 					      skb->csum)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 						 "Failed checksum for");
 				return 0;
 			}
@@ -721,8 +697,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.register_app =		tcp_register_app,
 	.unregister_app =	tcp_unregister_app,
 	.conn_schedule =	tcp_conn_schedule,
-	.conn_in_get =		tcp_conn_in_get,
-	.conn_out_get =		tcp_conn_out_get,
+	.conn_in_get =		ip_vs_conn_in_get_proto,
+	.conn_out_get =		ip_vs_conn_out_get_proto,
 	.snat_handler =		tcp_snat_handler,
 	.dnat_handler =		tcp_dnat_handler,
 	.csum_check =		tcp_csum_check,
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index e7a6885e016..9d106a06bb0 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -27,58 +27,6 @@
 #include <net/ip.h>
 #include <net/ip6_checksum.h>
 
-static struct ip_vs_conn *
-udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		int inverse)
-{
-	struct ip_vs_conn *cp;
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, iph->protocol,
-				       &iph->saddr, pptr[0],
-				       &iph->daddr, pptr[1]);
-	} else {
-		cp = ip_vs_conn_in_get(af, iph->protocol,
-				       &iph->daddr, pptr[1],
-				       &iph->saddr, pptr[0]);
-	}
-
-	return cp;
-}
-
-
-static struct ip_vs_conn *
-udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
-		 const struct ip_vs_iphdr *iph, unsigned int proto_off,
-		 int inverse)
-{
-	struct ip_vs_conn *cp;
-	__be16 _ports[2], *pptr;
-
-	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
-	if (pptr == NULL)
-		return NULL;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, iph->protocol,
-					&iph->saddr, pptr[0],
-					&iph->daddr, pptr[1]);
-	} else {
-		cp = ip_vs_conn_out_get(af, iph->protocol,
-					&iph->daddr, pptr[1],
-					&iph->saddr, pptr[0]);
-	}
-
-	return cp;
-}
-
-
 static int
 udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		  int *verdict, struct ip_vs_conn **cpp)
@@ -98,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
 				&iph.daddr, uh->dest);
 	if (svc) {
+		int ignored;
+
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -112,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * Let the virtual server select a real server for the
 		 * incoming connection, and create a connection entry.
 		 */
-		*cpp = ip_vs_schedule(svc, skb);
-		if (!*cpp) {
+		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
+		if (!*cpp && !ignored) {
 			*verdict = ip_vs_leave(svc, skb, pp);
 			return 0;
 		}
@@ -154,15 +104,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
 		uhdr->check =
-			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
 					 ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
+						csum_unfold(uhdr->check))));
 	else
 #endif
 	uhdr->check =
-		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
 				ip_vs_check_diff2(oldlen, newlen,
-						~csum_unfold(uhdr->check))));
+						csum_unfold(uhdr->check))));
 }
 
 
@@ -173,6 +123,7 @@ udp_snat_handler(struct sk_buff *skb,
 	struct udphdr *udph;
 	unsigned int udphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -187,6 +138,8 @@ udp_snat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -194,8 +147,13 @@ udp_snat_handler(struct sk_buff *skb,
 		/*
 		 *	Call application helper if needed
 		 */
-		if (!ip_vs_app_pkt_out(cp, skb))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
 	}
 
 	udph = (void *)skb_network_header(skb) + udphoff;
@@ -208,12 +166,13 @@ udp_snat_handler(struct sk_buff *skb,
 		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
+	} else if (!payload_csum && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
 				     cp->dport, cp->vport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		udph->check = 0;
@@ -233,6 +192,7 @@ udp_snat_handler(struct sk_buff *skb,
 							skb->csum);
 		if (udph->check == 0)
 			udph->check = CSUM_MANGLED_0;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
 			  pp->name, udph->check,
 			  (char*)&(udph->check) - (char*)udph);
@@ -248,6 +208,7 @@ udp_dnat_handler(struct sk_buff *skb,
 	struct udphdr *udph;
 	unsigned int udphoff;
 	int oldlen;
+	int payload_csum = 0;
 
 #ifdef CONFIG_IP_VS_IPV6
 	if (cp->af == AF_INET6)
@@ -262,6 +223,8 @@ udp_dnat_handler(struct sk_buff *skb,
 		return 0;
 
 	if (unlikely(cp->app != NULL)) {
+		int ret;
+
 		/* Some checks before mangling */
 		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
 			return 0;
@@ -270,8 +233,13 @@ udp_dnat_handler(struct sk_buff *skb,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn
 		 */
-		if (!ip_vs_app_pkt_in(cp, skb))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
 			return 0;
+		/* ret=2: csum update is needed after payload mangling */
+		if (ret == 1)
+			oldlen = skb->len - udphoff;
+		else
+			payload_csum = 1;
 	}
 
 	udph = (void *)skb_network_header(skb) + udphoff;
@@ -281,15 +249,16 @@ udp_dnat_handler(struct sk_buff *skb,
 	 *	Adjust UDP checksums
 	 */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+		udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 					htons(oldlen),
 					htons(skb->len - udphoff));
-	} else if (!cp->app && (udph->check != 0)) {
+	} else if (!payload_csum && (udph->check != 0)) {
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
 				     cp->vport, cp->dport);
 		if (skb->ip_summed == CHECKSUM_COMPLETE)
-			skb->ip_summed = CHECKSUM_NONE;
+			skb->ip_summed = (cp->app && pp->csum_check) ?
+					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
 		udph->check = 0;
@@ -345,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 						    skb->len - udphoff,
 						    ipv6_hdr(skb)->nexthdr,
 						    skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 							 "Failed checksum for");
 					return 0;
 				}
@@ -356,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
 						      skb->len - udphoff,
 						      ip_hdr(skb)->protocol,
 						      skb->csum)) {
-					IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
 							 "Failed checksum for");
 					return 0;
 				}
@@ -520,8 +489,8 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
 	.init =			udp_init,
 	.exit =			udp_exit,
 	.conn_schedule =	udp_conn_schedule,
-	.conn_in_get =		udp_conn_in_get,
-	.conn_out_get =		udp_conn_out_get,
+	.conn_in_get =		ip_vs_conn_in_get_proto,
+	.conn_out_get =		ip_vs_conn_out_get_proto,
 	.snat_handler =		udp_snat_handler,
 	.dnat_handler =		udp_dnat_handler,
 	.csum_check =		udp_csum_check,
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index bbc1ac79595..076ebe00435 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,7 +35,7 @@
 static LIST_HEAD(ip_vs_schedulers);
 
 /* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_sched_lock);
+static DEFINE_SPINLOCK(ip_vs_sched_lock);
 
 
 /*
@@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 {
 	int ret;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-	if (scheduler == NULL) {
-		pr_err("%s(): scheduler arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
 	svc->scheduler = scheduler;
 
 	if (scheduler->init_service) {
@@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
  */
 int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
-	struct ip_vs_scheduler *sched;
+	struct ip_vs_scheduler *sched = svc->scheduler;
 
-	if (svc == NULL) {
-		pr_err("%s(): svc arg NULL\n", __func__);
-		return -EINVAL;
-	}
-
-	sched = svc->scheduler;
-	if (sched == NULL) {
-		pr_err("%s(): svc isn't bound\n", __func__);
-		return -EINVAL;
-	}
+	if (!sched)
+		return 0;
 
 	if (sched->done_service) {
 		if (sched->done_service(svc) != 0) {
@@ -108,7 +91,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
 
 	IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
 
-	read_lock_bh(&__ip_vs_sched_lock);
+	spin_lock_bh(&ip_vs_sched_lock);
 
 	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
 		/*
@@ -122,14 +105,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
 		}
 		if (strcmp(sched_name, sched->name)==0) {
 			/* HIT */
-			read_unlock_bh(&__ip_vs_sched_lock);
+			spin_unlock_bh(&ip_vs_sched_lock);
 			return sched;
 		}
 		if (sched->module)
 			module_put(sched->module);
 	}
 
-	read_unlock_bh(&__ip_vs_sched_lock);
+	spin_unlock_bh(&ip_vs_sched_lock);
 	return NULL;
 }
 
@@ -159,7 +142,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
 
 void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 {
-	if (scheduler->module)
+	if (scheduler && scheduler->module)
 		module_put(scheduler->module);
 }
 
@@ -184,10 +167,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	/* increase the module use count */
 	ip_vs_use_count_inc();
 
-	write_lock_bh(&__ip_vs_sched_lock);
+	spin_lock_bh(&ip_vs_sched_lock);
 
 	if (!list_empty(&scheduler->n_list)) {
-		write_unlock_bh(&__ip_vs_sched_lock);
+		spin_unlock_bh(&ip_vs_sched_lock);
 		ip_vs_use_count_dec();
 		pr_err("%s(): [%s] scheduler already linked\n",
 		       __func__, scheduler->name);
@@ -200,7 +183,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 */
 	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
 		if (strcmp(scheduler->name, sched->name) == 0) {
-			write_unlock_bh(&__ip_vs_sched_lock);
+			spin_unlock_bh(&ip_vs_sched_lock);
 			ip_vs_use_count_dec();
 			pr_err("%s(): [%s] scheduler already existed "
 			       "in the system\n", __func__, scheduler->name);
@@ -211,7 +194,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 *	Add it into the d-linked scheduler list
 	 */
 	list_add(&scheduler->n_list, &ip_vs_schedulers);
-	write_unlock_bh(&__ip_vs_sched_lock);
+	spin_unlock_bh(&ip_vs_sched_lock);
 
 	pr_info("[%s] scheduler registered.\n", scheduler->name);
 
@@ -229,9 +212,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 		return -EINVAL;
 	}
 
-	write_lock_bh(&__ip_vs_sched_lock);
+	spin_lock_bh(&ip_vs_sched_lock);
 	if (list_empty(&scheduler->n_list)) {
-		write_unlock_bh(&__ip_vs_sched_lock);
+		spin_unlock_bh(&ip_vs_sched_lock);
 		pr_err("%s(): [%s] scheduler is not in the list. failed\n",
 		       __func__, scheduler->name);
 		return -EINVAL;
@@ -241,7 +224,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
 	 *	Remove it from the d-linked scheduler list
 	 */
 	list_del(&scheduler->n_list);
-	write_unlock_bh(&__ip_vs_sched_lock);
+	spin_unlock_bh(&ip_vs_sched_lock);
 
 	/* decrease the module use count */
 	ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba06939829..ab85aedea17 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 		ip_vs_sync_conn(cp->control);
 }
 
+static inline int
+ip_vs_conn_fill_param_sync(int af, int protocol,
+			   const union nf_inet_addr *caddr, __be16 cport,
+			   const union nf_inet_addr *vaddr, __be16 vport,
+			   struct ip_vs_conn_param *p)
+{
+	/* XXX: Need to take into account persistence engine */
+	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+	return 0;
+}
 
 /*
  *      Process received multicast message and create the corresponding
@@ -301,6 +311,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_dest *dest;
+	struct ip_vs_conn_param param;
 	char *p;
 	int i;
 
@@ -370,18 +381,20 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 			}
 		}
 
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport);
-		else
-			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
-					     (union nf_inet_addr *)&s->caddr,
-					     s->cport,
-					     (union nf_inet_addr *)&s->vaddr,
-					     s->vport);
+		{
+			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
+					      (union nf_inet_addr *)&s->caddr,
+					      s->cport,
+					      (union nf_inet_addr *)&s->vaddr,
+					      s->vport, &param)) {
+				pr_err("ip_vs_conn_fill_param_sync failed");
+				return;
+			}
+			if (!(flags & IP_VS_CONN_F_TEMPLATE))
+				cp = ip_vs_conn_in_get(&param);
+			else
+				cp = ip_vs_ct_in_get(&param);
+		}
 		if (!cp) {
 			/*
 			 * Find the appropriate destination for the connection.
@@ -406,14 +419,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 				else
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
-			cp = ip_vs_conn_new(AF_INET, s->protocol,
-					    (union nf_inet_addr *)&s->caddr,
-					    s->cport,
-					    (union nf_inet_addr *)&s->vaddr,
-					    s->vport,
+			cp = ip_vs_conn_new(&param,
 					    (union nf_inet_addr *)&s->daddr,
-					    s->dport,
-					    flags, dest);
+					    s->dport, flags, dest);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 			if (!cp) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 93c15a107b2..de04ea39cde 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -11,6 +11,16 @@
  *
  * Changes:
  *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -26,6 +36,7 @@
 #include <net/route.h>                  /* for ip_route_output */
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/addrconf.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
@@ -37,26 +48,27 @@
  *      Destination cache to speed up outgoing route lookup
  */
 static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
+		u32 dst_cookie)
 {
 	struct dst_entry *old_dst;
 
 	old_dst = dest->dst_cache;
 	dest->dst_cache = dst;
 	dest->dst_rtos = rtos;
+	dest->dst_cookie = dst_cookie;
 	dst_release(old_dst);
 }
 
 static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
 {
 	struct dst_entry *dst = dest->dst_cache;
 
 	if (!dst)
 		return NULL;
-	if ((dst->obsolete
-	     || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
-	    dst->ops->check(dst, cookie) == NULL) {
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, dest->dst_cookie) == NULL) {
 		dest->dst_cache = NULL;
 		dst_release(dst);
 		return NULL;
@@ -65,16 +77,24 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
 	return dst;
 }
 
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *	    &4=Allow redirect from remote daddr to local
+ */
 static struct rtable *
-__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
+		   __be32 daddr, u32 rtos, int rt_mode)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rtable *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
+	struct rtable *ort;			/* Original route */
+	int local;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
 		if (!(rt = (struct rtable *)
-		      __ip_vs_dst_check(dest, rtos, 0))) {
+		      __ip_vs_dst_check(dest, rtos))) {
 			struct flowi fl = {
 				.oif = 0,
 				.nl_u = {
@@ -84,16 +104,16 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 						.tos = rtos, } },
 			};
 
-			if (ip_route_output_key(&init_net, &rt, &fl)) {
+			if (ip_route_output_key(net, &rt, &fl)) {
 				spin_unlock(&dest->dst_lock);
 				IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
 					     &dest->addr.ip);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
 			IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
 				  &dest->addr.ip,
-				  atomic_read(&rt->u.dst.__refcnt), rtos);
+				  atomic_read(&rt->dst.__refcnt), rtos);
 		}
 		spin_unlock(&dest->dst_lock);
 	} else {
@@ -101,78 +121,199 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
 			.oif = 0,
 			.nl_u = {
 				.ip4_u = {
-					.daddr = cp->daddr.ip,
+					.daddr = daddr,
 					.saddr = 0,
 					.tos = rtos, } },
 		};
 
-		if (ip_route_output_key(&init_net, &rt, &fl)) {
+		if (ip_route_output_key(net, &rt, &fl)) {
 			IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
-				     &cp->daddr.ip);
+				     &daddr);
 			return NULL;
 		}
 	}
 
+	local = rt->rt_flags & RTCF_LOCAL;
+	if (!((local ? 1 : 2) & rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
+			     (rt->rt_flags & RTCF_LOCAL) ?
+			     "local":"non-local", &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) &&
+					 ort->rt_flags & RTCF_LOCAL)) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
+			     "requires NAT method, dest: %pI4\n",
+			     &ip_hdr(skb)->daddr, &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+	if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
+			     "to non-local address, dest: %pI4\n",
+			     &ip_hdr(skb)->saddr, &rt->rt_dst);
+		ip_rt_put(rt);
+		return NULL;
+	}
+
 	return rt;
 }
 
+/* Reroute packet to local IPv4 stack after DNAT */
+static int
+__ip_vs_reroute_locally(struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct net_device *dev = rt->dst.dev;
+	struct net *net = dev_net(dev);
+	struct iphdr *iph = ip_hdr(skb);
+
+	if (rt->fl.iif) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
+			return 0;
+		refdst_drop(orefdst);
+	} else {
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos),
+				}
+			},
+			.mark = skb->mark,
+		};
+		struct rtable *rt;
+
+		if (ip_route_output_key(net, &rt, &fl))
+			return 0;
+		if (!(rt->rt_flags & RTCF_LOCAL)) {
+			ip_rt_put(rt);
+			return 0;
+		}
+		/* Drop old route. */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	}
+	return 1;
+}
+
 #ifdef CONFIG_IP_VS_IPV6
+
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+	return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK;
+}
+
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+			struct in6_addr *ret_saddr, int do_xfrm)
+{
+	struct dst_entry *dst;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+			},
+		},
+	};
+
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst->error)
+		goto out_err;
+	if (!ret_saddr)
+		return dst;
+	if (ipv6_addr_any(&fl.fl6_src) &&
+	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+			       &fl.fl6_dst, 0, &fl.fl6_src) < 0)
+		goto out_err;
+	if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
+		goto out_err;
+	ipv6_addr_copy(ret_saddr, &fl.fl6_src);
+	return dst;
+
+out_err:
+	dst_release(dst);
+	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+	return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest,
+ *	    &4=Allow redirect from remote daddr to local
+ */
 static struct rt6_info *
-__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
+__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
+		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
+		      int do_xfrm, int rt_mode)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct rt6_info *rt;			/* Route to the other host */
-	struct ip_vs_dest *dest = cp->dest;
+	struct rt6_info *ort;			/* Original route */
+	struct dst_entry *dst;
+	int local;
 
 	if (dest) {
 		spin_lock(&dest->dst_lock);
-		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
+		rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
 		if (!rt) {
-			struct flowi fl = {
-				.oif = 0,
-				.nl_u = {
-					.ip6_u = {
-						.daddr = dest->addr.in6,
-						.saddr = {
-							.s6_addr32 =
-								{ 0, 0, 0, 0 },
-						},
-					},
-				},
-			};
+			u32 cookie;
 
-			rt = (struct rt6_info *)ip6_route_output(&init_net,
-								 NULL, &fl);
-			if (!rt) {
+			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+						      &dest->dst_saddr,
+						      do_xfrm);
+			if (!dst) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-					     &dest->addr.in6);
 				return NULL;
 			}
-			__ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
-			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
-				  &dest->addr.in6,
-				  atomic_read(&rt->u.dst.__refcnt));
+			rt = (struct rt6_info *) dst;
+			cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+			__ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+				  &dest->addr.in6, &dest->dst_saddr,
+				  atomic_read(&rt->dst.__refcnt));
 		}
+		if (ret_saddr)
+			ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
 		spin_unlock(&dest->dst_lock);
 	} else {
-		struct flowi fl = {
-			.oif = 0,
-			.nl_u = {
-				.ip6_u = {
-					.daddr = cp->daddr.in6,
-					.saddr = {
-						.s6_addr32 = { 0, 0, 0, 0 },
-					},
-				},
-			},
-		};
-
-		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
-				     &cp->daddr.in6);
+		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+		if (!dst)
 			return NULL;
-		}
+		rt = (struct rt6_info *) dst;
+	}
+
+	local = __ip_vs_is_local_route6(rt);
+	if (!((local ? 1 : 2) & rt_mode)) {
+		IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n",
+			     local ? "local":"non-local", daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (local && !(rt_mode & 4) &&
+	    !((ort = (struct rt6_info *) skb_dst(skb)) &&
+	      __ip_vs_is_local_route6(ort))) {
+		IP_VS_DBG_RL("Redirect from non-local address %pI6 to local "
+			     "requires NAT method, dest: %pI6\n",
+			     &ipv6_hdr(skb)->daddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+		     ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+				    IPV6_ADDR_LOOPBACK)) {
+		IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 "
+			     "to non-local address, dest: %pI6\n",
+			     &ipv6_hdr(skb)->saddr, daddr);
+		dst_release(&rt->dst);
+		return NULL;
 	}
 
 	return rt;
@@ -193,12 +334,44 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 	dst_release(old_dst);
 }
 
-#define IP_VS_XMIT(pf, skb, rt)				\
+#define IP_VS_XMIT_TUNNEL(skb, cp)				\
+({								\
+	int __ret = NF_ACCEPT;					\
+								\
+	(skb)->ipvs_property = 1;				\
+	if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))		\
+		__ret = ip_vs_confirm_conntrack(skb, cp);	\
+	if (__ret == NF_ACCEPT) {				\
+		nf_reset(skb);					\
+		skb_forward_csum(skb);				\
+	}							\
+	__ret;							\
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp, local)		\
+do {							\
+	(skb)->ipvs_property = 1;			\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
+		ip_vs_notrack(skb);			\
+	else						\
+		ip_vs_update_conntrack(skb, cp, 1);	\
+	if (local)					\
+		return NF_ACCEPT;			\
+	skb_forward_csum(skb);				\
+	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
+		skb_dst(skb)->dev, dst_output);		\
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp, local)			\
 do {							\
 	(skb)->ipvs_property = 1;			\
+	if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT)))	\
+		ip_vs_notrack(skb);			\
+	if (local)					\
+		return NF_ACCEPT;			\
 	skb_forward_csum(skb);				\
 	NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,	\
-		(rt)->u.dst.dev, dst_output);		\
+		skb_dst(skb)->dev, dst_output);		\
 } while (0)
 
 
@@ -210,7 +383,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct ip_vs_protocol *pp)
 {
 	/* we do not touch skb and do not need pskb ptr */
-	return NF_ACCEPT;
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
 }
 
 
@@ -225,27 +398,16 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rtable *rt;			/* Route to the other host */
 	struct iphdr  *iph = ip_hdr(skb);
-	u8     tos = iph->tos;
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip4_u = {
-				.daddr = iph->daddr,
-				.saddr = 0,
-				.tos = RT_TOS(tos), } },
-	};
 
 	EnterFunction(10);
 
-	if (ip_route_output_key(&init_net, &rt, &fl)) {
-		IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
-			     __func__, &iph->daddr);
+	if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr,
+				      RT_TOS(iph->tos), 2)))
 		goto tx_error_icmp;
-	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
 		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
@@ -265,12 +427,12 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* drop old route */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -291,28 +453,22 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info *rt;			/* Route to the other host */
 	struct ipv6hdr  *iph = ipv6_hdr(skb);
 	int    mtu;
-	struct flowi fl = {
-		.oif = 0,
-		.nl_u = {
-			.ip6_u = {
-				.daddr = iph->daddr,
-				.saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
-	};
 
 	EnterFunction(10);
 
-	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
-	if (!rt) {
-		IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
-			     __func__, &iph->daddr);
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2)))
 		goto tx_error_icmp;
-	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->u.dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error;
 	}
@@ -323,18 +479,18 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 */
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (unlikely(skb == NULL)) {
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 		return NF_STOLEN;
 	}
 
 	/* drop old route */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -359,6 +515,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable *rt;		/* Route to the other host */
 	int mtu;
 	struct iphdr *iph = ip_hdr(skb);
+	int local;
 
 	EnterFunction(10);
 
@@ -372,36 +529,73 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos), 1|2|4)))
 		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+					 "ip_vs_nat_xmit(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
-		goto tx_error;
+		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+				 "ip_vs_nat_xmit(): frag needed for");
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, sizeof(struct iphdr)))
 		goto tx_error_put;
 
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
-
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
-		goto tx_error;
+		goto tx_error_put;
 	ip_hdr(skb)->daddr = cp->daddr.ip;
 	ip_send_check(ip_hdr(skb));
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+	if (!local) {
+		/* drop old route */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
+	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
@@ -410,7 +604,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -418,8 +612,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
   tx_error_icmp:
 	dst_link_failure(skb);
   tx_error:
-	LeaveFunction(10);
 	kfree_skb(skb);
+	LeaveFunction(10);
 	return NF_STOLEN;
   tx_error_put:
 	ip_rt_put(rt);
@@ -433,6 +627,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct rt6_info *rt;		/* Route to the other host */
 	int mtu;
+	int local;
 
 	EnterFunction(10);
 
@@ -447,37 +642,73 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
 
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2|4)))
 		goto tx_error_icmp;
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+					 "ip_vs_nat_xmit_v6(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->u.dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
 				 "ip_vs_nat_xmit_v6(): frag needed for");
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
 		goto tx_error_put;
 
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop old route */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
-
 	/* mangle the packet */
 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
 		goto tx_error;
-	ipv6_hdr(skb)->daddr = cp->daddr.in6;
+	ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6);
 
-	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
+	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
 
 	/* FIXME: when application helper enlarges the packet and the length
 	   is larger than the MTU of outgoing device, there will be still
@@ -486,7 +717,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -498,7 +729,7 @@ tx_error:
 	kfree_skb(skb);
 	return NF_STOLEN;
 tx_error_put:
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 	goto tx_error;
 }
 #endif
@@ -532,30 +763,27 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct iphdr  *old_iph = ip_hdr(skb);
 	u8     tos = old_iph->tos;
 	__be16 df = old_iph->frag_off;
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    mtu;
+	int ret;
 
 	EnterFunction(10);
 
-	if (skb->protocol != htons(ETH_P_IP)) {
-		IP_VS_DBG_RL("%s(): protocol error, "
-			     "ETH_P_IP: %d, skb protocol: %d\n",
-			     __func__, htons(ETH_P_IP), skb->protocol);
-		goto tx_error;
-	}
-
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(tos), 1|2)))
 		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
 
-	tdev = rt->u.dst.dev;
+	tdev = rt->dst.dev;
 
-	mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 	if (mtu < 68) {
-		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
@@ -565,9 +793,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if ((old_iph->frag_off & htons(IP_DF))
 	    && mtu < ntohs(old_iph->tot_len)) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
-		ip_rt_put(rt);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/*
@@ -590,7 +817,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;
 
 	/* fix old IP header checksum */
 	ip_send_check(old_iph);
@@ -601,7 +828,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* drop old route */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -615,12 +842,16 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	iph->daddr		=	rt->rt_dst;
 	iph->saddr		=	rt->rt_src;
 	iph->ttl		=	old_iph->ttl;
-	ip_select_ident(iph, &rt->u.dst, NULL);
+	ip_select_ident(iph, &rt->dst, NULL);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ip_local_out(skb);
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
 
 	LeaveFunction(10);
 
@@ -632,6 +863,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
+tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -640,43 +874,44 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		     struct ip_vs_protocol *pp)
 {
 	struct rt6_info *rt;		/* Route to the other host */
+	struct in6_addr saddr;		/* Source for tunnel */
 	struct net_device *tdev;	/* Device to other host */
 	struct ipv6hdr  *old_iph = ipv6_hdr(skb);
-	sk_buff_data_t old_transport_header = skb->transport_header;
 	struct ipv6hdr  *iph;		/* Our new IP header */
 	unsigned int max_headroom;	/* The extra header space needed */
 	int    mtu;
+	int ret;
 
 	EnterFunction(10);
 
-	if (skb->protocol != htons(ETH_P_IPV6)) {
-		IP_VS_DBG_RL("%s(): protocol error, "
-			     "ETH_P_IPV6: %d, skb protocol: %d\n",
-			     __func__, htons(ETH_P_IPV6), skb->protocol);
-		goto tx_error;
-	}
-
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+					 &saddr, 1, 1|2)))
 		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
 
-	tdev = rt->u.dst.dev;
+	tdev = rt->dst.dev;
 
-	mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
-	/* TODO IPv6: do we need this check in IPv6? */
-	if (mtu < 1280) {
-		dst_release(&rt->u.dst);
-		IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
-		goto tx_error;
+	mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+	if (mtu < IPV6_MIN_MTU) {
+		IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+			     IPV6_MIN_MTU);
+		goto tx_error_put;
 	}
 	if (skb_dst(skb))
 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
 	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		dst_release(&rt->u.dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/*
@@ -689,7 +924,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		struct sk_buff *new_skb =
 			skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
-			dst_release(&rt->u.dst);
+			dst_release(&rt->dst);
 			kfree_skb(skb);
 			IP_VS_ERR_RL("%s(): no memory\n", __func__);
 			return NF_STOLEN;
@@ -699,7 +934,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		old_iph = ipv6_hdr(skb);
 	}
 
-	skb->transport_header = old_transport_header;
+	skb->transport_header = skb->network_header;
 
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
@@ -707,7 +942,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* drop old route */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/*
 	 *	Push down and install the IPIP header.
@@ -719,14 +954,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
 	iph->priority		=	old_iph->priority;
 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
-	iph->daddr		=	rt->rt6i_dst.addr;
-	iph->saddr		=	cp->vaddr.in6; /* rt->rt6i_src.addr; */
+	ipv6_addr_copy(&iph->daddr, &cp->daddr.in6);
+	ipv6_addr_copy(&iph->saddr, &saddr);
 	iph->hop_limit		=	old_iph->hop_limit;
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	ip6_local_out(skb);
+	ret = IP_VS_XMIT_TUNNEL(skb, cp);
+	if (ret == NF_ACCEPT)
+		ip6_local_out(skb);
+	else if (ret == NF_DROP)
+		kfree_skb(skb);
 
 	LeaveFunction(10);
 
@@ -738,6 +977,9 @@ tx_error:
 	kfree_skb(skb);
 	LeaveFunction(10);
 	return NF_STOLEN;
+tx_error_put:
+	dst_release(&rt->dst);
+	goto tx_error;
 }
 #endif
 
@@ -756,11 +998,16 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos), 1|2)))
 		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
 		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
 		ip_rt_put(rt);
@@ -780,12 +1027,12 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	/* drop old route */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -808,15 +1055,24 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	EnterFunction(10);
 
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2)))
 		goto tx_error_icmp;
+	if (__ip_vs_is_local_route6(rt)) {
+		dst_release(&rt->dst);
+		IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
 		goto tx_error;
 	}
@@ -827,18 +1083,18 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 */
 	skb = skb_share_check(skb, GFP_ATOMIC);
 	if (unlikely(skb == NULL)) {
-		dst_release(&rt->u.dst);
+		dst_release(&rt->dst);
 		return NF_STOLEN;
 	}
 
 	/* drop old route */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
+	skb_dst_set(skb, &rt->dst);
 
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
 
 	LeaveFunction(10);
 	return NF_STOLEN;
@@ -864,6 +1120,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable	*rt;	/* Route to the other host */
 	int mtu;
 	int rc;
+	int local;
 
 	EnterFunction(10);
 
@@ -884,35 +1141,73 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(ip_hdr(skb)->tos), 1|2|4)))
 		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI4\n",
+				  __func__, &cp->daddr.ip);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI4\n",
+			  __func__, &cp->daddr.ip);
+		goto tx_error_put;
+	}
 
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
-		ip_rt_put(rt);
 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, offset))
 		goto tx_error_put;
 
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop the old route when skb is not shared */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
-
 	ip_vs_nat_icmp(skb, pp, cp, 0);
 
+	if (!local) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
 
 	rc = NF_STOLEN;
 	goto out;
@@ -938,6 +1233,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info	*rt;	/* Route to the other host */
 	int mtu;
 	int rc;
+	int local;
 
 	EnterFunction(10);
 
@@ -958,36 +1254,73 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	 * mangle and send the packet here (only for VS/NAT)
 	 */
 
-	rt = __ip_vs_get_out_rt_v6(cp);
-	if (!rt)
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, 1|2|4)))
 		goto tx_error_icmp;
 
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI6\n",
+				  __func__, &cp->daddr.in6);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI6\n",
+			  __func__, &cp->daddr.in6);
+		goto tx_error_put;
+	}
+
 	/* MTU checking */
-	mtu = dst_mtu(&rt->u.dst);
+	mtu = dst_mtu(&rt->dst);
 	if (skb->len > mtu) {
-		dst_release(&rt->u.dst);
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
-		goto tx_error;
+		goto tx_error_put;
 	}
 
 	/* copy-on-write the packet before mangling it */
 	if (!skb_make_writable(skb, offset))
 		goto tx_error_put;
 
-	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
 		goto tx_error_put;
 
-	/* drop the old route when skb is not shared */
-	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
-
 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
 
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->local_df = 1;
 
-	IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
 
 	rc = NF_STOLEN;
 	goto out;
@@ -1001,7 +1334,7 @@ out:
 	LeaveFunction(10);
 	return rc;
 tx_error_put:
-	dst_release(&rt->u.dst);
+	dst_release(&rt->dst);
 	goto tx_error;
 }
 #endif
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index ab81b380eae..5178c691ecb 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -17,13 +17,7 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
 
-#ifdef CONFIG_NF_CT_ACCT
-#define NF_CT_ACCT_DEFAULT 1
-#else
-#define NF_CT_ACCT_DEFAULT 0
-#endif
-
-static int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT;
+static int nf_ct_acct __read_mostly;
 
 module_param_named(acct, nf_ct_acct, bool, 0644);
 MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
@@ -114,12 +108,6 @@ int nf_conntrack_acct_init(struct net *net)
 	net->ct.sysctl_acct = nf_ct_acct;
 
 	if (net_eq(net, &init_net)) {
-#ifdef CONFIG_NF_CT_ACCT
-	printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Please use\n");
-		printk(KERN_WARNING "nf_conntrack.acct=1 kernel parameter, acct=1 nf_conntrack module option or\n");
-		printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n");
-#endif
-
 		ret = nf_ct_extend_register(&acct_extend);
 		if (ret < 0) {
 			printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eeeb8bc7398..1eacf8d9966 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -62,35 +62,45 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 unsigned int nf_conntrack_max __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_max);
 
-struct nf_conn nf_conntrack_untracked __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
+DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
+EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
-static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
-				  u16 zone, unsigned int size, unsigned int rnd)
+static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone)
 {
 	unsigned int n;
-	u_int32_t h;
 
 	/* The direction must be ignored, so we hash everything up to the
 	 * destination ports (which is a multiple of 4) and treat the last
 	 * three bytes manually.
 	 */
 	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
-	h = jhash2((u32 *)tuple, n,
-		   zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
-				 tuple->dst.protonum));
+	return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^
+		      (((__force __u16)tuple->dst.u.all << 16) |
+		      tuple->dst.protonum));
+}
+
+static u32 __hash_bucket(u32 hash, unsigned int size)
+{
+	return ((u64)hash * size) >> 32;
+}
+
+static u32 hash_bucket(u32 hash, const struct net *net)
+{
+	return __hash_bucket(hash, net->ct.htable_size);
+}
 
-	return ((u64)h * size) >> 32;
+static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
+				  u16 zone, unsigned int size)
+{
+	return __hash_bucket(hash_conntrack_raw(tuple, zone), size);
 }
 
 static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
 				       const struct nf_conntrack_tuple *tuple)
 {
-	return __hash_conntrack(tuple, zone, net->ct.htable_size,
-				nf_conntrack_hash_rnd);
+	return __hash_conntrack(tuple, zone, net->ct.htable_size);
 }
 
 bool
@@ -292,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack)
  * OR
  * - Caller must lock nf_conntrack_lock before calling this function
  */
-struct nf_conntrack_tuple_hash *
-__nf_conntrack_find(struct net *net, u16 zone,
-		    const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+____nf_conntrack_find(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
-	unsigned int hash = hash_conntrack(net, zone, tuple);
+	unsigned int bucket = hash_bucket(hash, net);
 
 	/* Disable BHs the entire time since we normally need to disable them
 	 * at least once for the stats anyway.
 	 */
 	local_bh_disable();
 begin:
-	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
 		if (nf_ct_tuple_equal(tuple, &h->tuple) &&
 		    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
 			NF_CT_STAT_INC(net, found);
@@ -319,7 +329,7 @@ begin:
 	 * not the expected one, we must restart lookup.
 	 * We probably met an item that was moved to another chain.
 	 */
-	if (get_nulls_value(n) != hash) {
+	if (get_nulls_value(n) != bucket) {
 		NF_CT_STAT_INC(net, search_restart);
 		goto begin;
 	}
@@ -327,19 +337,27 @@ begin:
 
 	return NULL;
 }
+
+struct nf_conntrack_tuple_hash *
+__nf_conntrack_find(struct net *net, u16 zone,
+		    const struct nf_conntrack_tuple *tuple)
+{
+	return ____nf_conntrack_find(net, zone, tuple,
+				     hash_conntrack_raw(tuple, zone));
+}
 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 
 /* Find a connection corresponding to a tuple. */
-struct nf_conntrack_tuple_hash *
-nf_conntrack_find_get(struct net *net, u16 zone,
-		      const struct nf_conntrack_tuple *tuple)
+static struct nf_conntrack_tuple_hash *
+__nf_conntrack_find_get(struct net *net, u16 zone,
+			const struct nf_conntrack_tuple *tuple, u32 hash)
 {
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 
 	rcu_read_lock();
 begin:
-	h = __nf_conntrack_find(net, zone, tuple);
+	h = ____nf_conntrack_find(net, zone, tuple, hash);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (unlikely(nf_ct_is_dying(ct) ||
@@ -357,6 +375,14 @@ begin:
 
 	return h;
 }
+
+struct nf_conntrack_tuple_hash *
+nf_conntrack_find_get(struct net *net, u16 zone,
+		      const struct nf_conntrack_tuple *tuple)
+{
+	return __nf_conntrack_find_get(net, zone, tuple,
+				       hash_conntrack_raw(tuple, zone));
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
@@ -409,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	/* reuse the hash saved before */
+	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+	hash = hash_bucket(hash, net);
+	repl_hash = hash_conntrack(net, zone,
+				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
 	/* We're not in hash table, and we refuse to set up related
 	   connections for unconfirmed conns.  But packet copies and
@@ -567,17 +596,29 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 	return dropped;
 }
 
-struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
-				   const struct nf_conntrack_tuple *orig,
-				   const struct nf_conntrack_tuple *repl,
-				   gfp_t gfp)
+static struct nf_conn *
+__nf_conntrack_alloc(struct net *net, u16 zone,
+		     const struct nf_conntrack_tuple *orig,
+		     const struct nf_conntrack_tuple *repl,
+		     gfp_t gfp, u32 hash)
 {
 	struct nf_conn *ct;
 
-	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
-		get_random_bytes(&nf_conntrack_hash_rnd,
-				sizeof(nf_conntrack_hash_rnd));
-		nf_conntrack_hash_rnd_initted = 1;
+	if (unlikely(!nf_conntrack_hash_rnd)) {
+		unsigned int rand;
+
+		/*
+		 * Why not initialize nf_conntrack_rnd in a "init()" function ?
+		 * Because there isn't enough entropy when system initializing,
+		 * and we initialize it as late as possible.
+		 */
+		do {
+			get_random_bytes(&rand, sizeof(rand));
+		} while (!rand);
+		cmpxchg(&nf_conntrack_hash_rnd, 0, rand);
+
+		/* recompute the hash as nf_conntrack_hash_rnd is initialized */
+		hash = hash_conntrack_raw(orig, zone);
 	}
 
 	/* We don't want any race condition at early drop stage */
@@ -585,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		unsigned int hash = hash_conntrack(net, zone, orig);
-		if (!early_drop(net, hash)) {
+		if (!early_drop(net, hash_bucket(hash, net))) {
 			atomic_dec(&net->ct.count);
 			if (net_ratelimit())
 				printk(KERN_WARNING
@@ -616,12 +656,11 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
 	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
-	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
+	/* save hash for reusing when confirming */
+	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
 	/* Don't set timer yet: wait for confirmation */
 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
-#ifdef CONFIG_NET_NS
-	ct->ct_net = net;
-#endif
+	write_pnet(&ct->ct_net, net);
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	if (zone) {
 		struct nf_conntrack_zone *nf_ct_zone;
@@ -645,6 +684,14 @@ out_free:
 	return ERR_PTR(-ENOMEM);
 #endif
 }
+
+struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
+				   const struct nf_conntrack_tuple *orig,
+				   const struct nf_conntrack_tuple *repl,
+				   gfp_t gfp)
+{
+	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+}
 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 
 void nf_conntrack_free(struct nf_conn *ct)
@@ -666,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	       struct nf_conntrack_l3proto *l3proto,
 	       struct nf_conntrack_l4proto *l4proto,
 	       struct sk_buff *skb,
-	       unsigned int dataoff)
+	       unsigned int dataoff, u32 hash)
 {
 	struct nf_conn *ct;
 	struct nf_conn_help *help;
@@ -680,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 		return NULL;
 	}
 
-	ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
+	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
+				  hash);
 	if (IS_ERR(ct)) {
 		pr_debug("Can't allocate conntrack.\n");
 		return (struct nf_conntrack_tuple_hash *)ct;
@@ -757,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
+	u32 hash;
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
@@ -766,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* look for tuple match */
-	h = nf_conntrack_find_get(net, zone, &tuple);
+	hash = hash_conntrack_raw(&tuple, zone);
+	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
 		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
-				   skb, dataoff);
+				   skb, dataoff, hash);
 		if (!h)
 			return NULL;
 		if (IS_ERR(h))
@@ -968,8 +1018,7 @@ acct:
 		if (acct) {
 			spin_lock_bh(&ct->lock);
 			acct[CTINFO2DIR(ctinfo)].packets++;
-			acct[CTINFO2DIR(ctinfo)].bytes +=
-				skb->len - skb_network_offset(skb);
+			acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
 			spin_unlock_bh(&ct->lock);
 		}
 	}
@@ -1183,10 +1232,21 @@ static void nf_ct_release_dying_list(struct net *net)
 	spin_unlock_bh(&nf_conntrack_lock);
 }
 
+static int untrack_refs(void)
+{
+	int cnt = 0, cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+
+		cnt += atomic_read(&ct->ct_general.use) - 1;
+	}
+	return cnt;
+}
+
 static void nf_conntrack_cleanup_init_net(void)
 {
-	/* wait until all references to nf_conntrack_untracked are dropped */
-	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+	while (untrack_refs() > 0)
 		schedule();
 
 	nf_conntrack_helper_fini();
@@ -1299,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			hlist_nulls_del_rcu(&h->hnnode);
 			bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
-						  hashsize,
-						  nf_conntrack_hash_rnd);
+						  hashsize);
 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
 		}
 	}
@@ -1321,10 +1380,19 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
+void nf_ct_untracked_status_or(unsigned long bits)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu(nf_conntrack_untracked, cpu).status |= bits;
+}
+EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
+
 static int nf_conntrack_init_init_net(void)
 {
 	int max_factor = 8;
-	int ret;
+	int ret, cpu;
 
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1363,13 +1431,13 @@ static int nf_conntrack_init_init_net(void)
 		goto err_extend;
 #endif
 	/* Set up fake conntrack: to never be deleted, not in any hashes */
-#ifdef CONFIG_NET_NS
-	nf_conntrack_untracked.ct_net = &init_net;
-#endif
-	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
+	for_each_possible_cpu(cpu) {
+		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+		write_pnet(&ct->ct_net, &init_net);
+		atomic_set(&ct->ct_general.use, 1);
+	}
 	/*  - and look it like as a confirmed connection */
-	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
-
+	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
 	return 0;
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476..5702de35e2b 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
 
 static DEFINE_MUTEX(nf_ct_ecache_mutex);
 
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
 
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
 EXPORT_SYMBOL_GPL(nf_expect_event_cb);
 
 /* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acb29ccaa41..46e8966912b 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -38,25 +38,30 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly;
 
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
+static HLIST_HEAD(nf_ct_userspace_expect_list);
+
 /* nf_conntrack_expect helper functions */
-void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+				u32 pid, int report)
 {
 	struct nf_conn_help *master_help = nfct_help(exp->master);
 	struct net *net = nf_ct_exp_net(exp);
 
-	NF_CT_ASSERT(master_help);
 	NF_CT_ASSERT(!timer_pending(&exp->timeout));
 
 	hlist_del_rcu(&exp->hnode);
 	net->ct.expect_count--;
 
 	hlist_del(&exp->lnode);
-	master_help->expecting[exp->class]--;
+	if (!(exp->flags & NF_CT_EXPECT_USERSPACE))
+		master_help->expecting[exp->class]--;
+
+	nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
 	nf_ct_expect_put(exp);
 
 	NF_CT_STAT_INC(net, expect_delete);
 }
-EXPORT_SYMBOL_GPL(nf_ct_unlink_expect);
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
 
 static void nf_ct_expectation_timed_out(unsigned long ul_expect)
 {
@@ -320,16 +325,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 
 	atomic_inc(&exp->use);
 
-	hlist_add_head(&exp->lnode, &master_help->expectations);
-	master_help->expecting[exp->class]++;
+	if (master_help) {
+		hlist_add_head(&exp->lnode, &master_help->expectations);
+		master_help->expecting[exp->class]++;
+	} else if (exp->flags & NF_CT_EXPECT_USERSPACE)
+		hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list);
 
 	hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
 	net->ct.expect_count++;
 
 	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
 		    (unsigned long)exp);
-	p = &master_help->helper->expect_policy[exp->class];
-	exp->timeout.expires = jiffies + p->timeout * HZ;
+	if (master_help) {
+		p = &master_help->helper->expect_policy[exp->class];
+		exp->timeout.expires = jiffies + p->timeout * HZ;
+	}
 	add_timer(&exp->timeout);
 
 	atomic_inc(&exp->use);
@@ -380,7 +390,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 	unsigned int h;
 	int ret = 1;
 
-	if (!master_help->helper) {
+	/* Don't allow expectations created from kernel-space with no helper */
+	if (!(expect->flags & NF_CT_EXPECT_USERSPACE) &&
+	    (!master_help || (master_help && !master_help->helper))) {
 		ret = -ESHUTDOWN;
 		goto out;
 	}
@@ -398,13 +410,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 		}
 	}
 	/* Will be over limit? */
-	p = &master_help->helper->expect_policy[expect->class];
-	if (p->max_expected &&
-	    master_help->expecting[expect->class] >= p->max_expected) {
-		evict_oldest_expect(master, expect);
-		if (master_help->expecting[expect->class] >= p->max_expected) {
-			ret = -EMFILE;
-			goto out;
+	if (master_help) {
+		p = &master_help->helper->expect_policy[expect->class];
+		if (p->max_expected &&
+		    master_help->expecting[expect->class] >= p->max_expected) {
+			evict_oldest_expect(master, expect);
+			if (master_help->expecting[expect->class]
+						>= p->max_expected) {
+				ret = -EMFILE;
+				goto out;
+			}
 		}
 	}
 
@@ -439,6 +454,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);
 
+void nf_ct_remove_userspace_expectations(void)
+{
+	struct nf_conntrack_expect *exp;
+	struct hlist_node *n, *next;
+
+	hlist_for_each_entry_safe(exp, n, next,
+				  &nf_ct_userspace_expect_list, lnode) {
+		if (del_timer(&exp->timeout)) {
+			nf_ct_unlink_expect(exp);
+			nf_ct_expect_put(exp);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations);
+
 #ifdef CONFIG_PROC_FS
 struct ct_expect_iter_state {
 	struct seq_net_private p;
@@ -529,8 +559,12 @@ static int exp_seq_show(struct seq_file *s, void *v)
 		seq_printf(s, "PERMANENT");
 		delim = ",";
 	}
-	if (expect->flags & NF_CT_EXPECT_INACTIVE)
+	if (expect->flags & NF_CT_EXPECT_INACTIVE) {
 		seq_printf(s, "%sINACTIVE", delim);
+		delim = ",";
+	}
+	if (expect->flags & NF_CT_EXPECT_USERSPACE)
+		seq_printf(s, "%sUSERSPACE", delim);
 
 	helper = rcu_dereference(nfct_help(expect->master)->helper);
 	if (helper) {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index fdc8fb4ae10..bd82450c193 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,16 +16,17 @@
 #include <linux/skbuff.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
 static DEFINE_MUTEX(nf_ct_ext_type_mutex);
 
 void __nf_ct_ext_destroy(struct nf_conn *ct)
 {
 	unsigned int i;
 	struct nf_ct_ext_type *t;
+	struct nf_ct_ext *ext = ct->ext;
 
 	for (i = 0; i < NF_CT_EXT_NUM; i++) {
-		if (!nf_ct_ext_exist(ct, i))
+		if (!__nf_ct_ext_exist(ext, i))
 			continue;
 
 		rcu_read_lock();
@@ -47,15 +48,17 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
 {
 	unsigned int off, len;
 	struct nf_ct_ext_type *t;
+	size_t alloc_size;
 
 	rcu_read_lock();
 	t = rcu_dereference(nf_ct_ext_types[id]);
 	BUG_ON(t == NULL);
 	off = ALIGN(sizeof(struct nf_ct_ext), t->align);
 	len = off + t->len;
+	alloc_size = t->alloc_size;
 	rcu_read_unlock();
 
-	*ext = kzalloc(t->alloc_size, gfp);
+	*ext = kzalloc(alloc_size, gfp);
 	if (!*ext)
 		return NULL;
 
@@ -73,44 +76,45 @@ static void __nf_ct_ext_free_rcu(struct rcu_head *head)
 
 void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 {
-	struct nf_ct_ext *new;
+	struct nf_ct_ext *old, *new;
 	int i, newlen, newoff;
 	struct nf_ct_ext_type *t;
 
 	/* Conntrack must not be confirmed to avoid races on reallocation. */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 
-	if (!ct->ext)
+	old = ct->ext;
+	if (!old)
 		return nf_ct_ext_create(&ct->ext, id, gfp);
 
-	if (nf_ct_ext_exist(ct, id))
+	if (__nf_ct_ext_exist(old, id))
 		return NULL;
 
 	rcu_read_lock();
 	t = rcu_dereference(nf_ct_ext_types[id]);
 	BUG_ON(t == NULL);
 
-	newoff = ALIGN(ct->ext->len, t->align);
+	newoff = ALIGN(old->len, t->align);
 	newlen = newoff + t->len;
 	rcu_read_unlock();
 
-	new = __krealloc(ct->ext, newlen, gfp);
+	new = __krealloc(old, newlen, gfp);
 	if (!new)
 		return NULL;
 
-	if (new != ct->ext) {
+	if (new != old) {
 		for (i = 0; i < NF_CT_EXT_NUM; i++) {
-			if (!nf_ct_ext_exist(ct, i))
+			if (!__nf_ct_ext_exist(old, i))
 				continue;
 
 			rcu_read_lock();
 			t = rcu_dereference(nf_ct_ext_types[i]);
 			if (t && t->move)
 				t->move((void *)new + new->offset[i],
-					(void *)ct->ext + ct->ext->offset[i]);
+					(void *)old + old->offset[i]);
 			rcu_read_unlock();
 		}
-		call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu);
+		call_rcu(&old->rcu, __nf_ct_ext_free_rcu);
 		ct->ext = new;
 	}
 
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 6eaee7c8a33..b969025cf82 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -734,11 +734,11 @@ static int callforward_do_filter(const union nf_inet_addr *src,
 		if (!afinfo->route((struct dst_entry **)&rt1, &fl1)) {
 			if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
 				if (rt1->rt_gateway == rt2->rt_gateway &&
-				    rt1->u.dst.dev  == rt2->u.dst.dev)
+				    rt1->dst.dev  == rt2->dst.dev)
 					ret = 1;
-				dst_release(&rt2->u.dst);
+				dst_release(&rt2->dst);
 			}
-			dst_release(&rt1->u.dst);
+			dst_release(&rt1->dst);
 		}
 		break;
 	}
@@ -753,11 +753,11 @@ static int callforward_do_filter(const union nf_inet_addr *src,
 			if (!afinfo->route((struct dst_entry **)&rt2, &fl2)) {
 				if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway,
 					    sizeof(rt1->rt6i_gateway)) &&
-				    rt1->u.dst.dev == rt2->u.dst.dev)
+				    rt1->dst.dev == rt2->dst.dev)
 					ret = 1;
-				dst_release(&rt2->u.dst);
+				dst_release(&rt2->dst);
 			}
-			dst_release(&rt1->u.dst);
+			dst_release(&rt1->dst);
 		}
 		break;
 	}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 497b2224536..aadde018a07 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -61,7 +61,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 		goto out;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
 	if (in_dev != NULL) {
 		for_primary_ifa(in_dev) {
 			if (ifa->ifa_broadcast == iph->daddr) {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c42ff6aa441..b729ace1dcc 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/types.h>
 #include <linux/timer.h>
+#include <linux/security.h>
 #include <linux/skbuff.h>
 #include <linux/errno.h>
 #include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 static inline int
-ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
 {
-	NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark));
-	return 0;
+	struct nlattr *nest_secctx;
+	int len, ret;
+	char *secctx;
+
+	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+	if (ret)
+		return ret;
+
+	ret = -1;
+	nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+	if (!nest_secctx)
+		goto nla_put_failure;
 
+	NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+	nla_nest_end(skb, nest_secctx);
+
+	ret = 0;
 nla_put_failure:
-	return -1;
+	security_release_secctx(secctx, len);
+	return ret;
 }
 #else
-#define ctnetlink_dump_secmark(a, b) (0)
+#define ctnetlink_dump_secctx(a, b) (0)
 #endif
 
 #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	    ctnetlink_dump_protoinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
-	    ctnetlink_dump_secmark(skb, ct) < 0 ||
+	    ctnetlink_dump_secctx(skb, ct) < 0 ||
 	    ctnetlink_dump_id(skb, ct) < 0 ||
 	    ctnetlink_dump_use(skb, ct) < 0 ||
 	    ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
 	       ;
 }
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+{
+	int len;
+
+	security_secid_to_secctx(ct->secmark, NULL, &len);
+
+	return sizeof(char) * len;
+}
+#endif
+
 static inline size_t
 ctnetlink_nlmsg_size(const struct nf_conn *ct)
 {
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
 	       + nla_total_size(0) /* CTA_HELP */
 	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
-	       + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */
+	       + nla_total_size(0) /* CTA_SECCTX */
+	       + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
 #endif
 #ifdef CONFIG_NF_NAT_NEEDED
 	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -480,7 +508,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 	int err;
 
 	/* ignore our fake conntrack entry */
-	if (ct == &nf_conntrack_untracked)
+	if (nf_ct_is_untracked(ct))
 		return 0;
 
 	if (events & (1 << IPCT_DESTROY)) {
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
 		if ((events & (1 << IPCT_SECMARK) || ct->secmark)
-		    && ctnetlink_dump_secmark(skb, ct) < 0)
+		    && ctnetlink_dump_secctx(skb, ct) < 0)
 			goto nla_put_failure;
 #endif
 
@@ -1560,8 +1588,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 			  const struct nf_conntrack_expect *exp)
 {
 	struct nf_conn *master = exp->master;
-	struct nf_conntrack_helper *helper;
 	long timeout = (exp->timeout.expires - jiffies) / HZ;
+	struct nf_conn_help *help;
 
 	if (timeout < 0)
 		timeout = 0;
@@ -1577,9 +1605,15 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
 
 	NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout));
 	NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp));
-	helper = rcu_dereference(nfct_help(master)->helper);
-	if (helper)
-		NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags));
+	help = nfct_help(master);
+	if (help) {
+		struct nf_conntrack_helper *helper;
+
+		helper = rcu_dereference(help->helper);
+		if (helper)
+			NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name);
+	}
 
 	return 0;
 
@@ -1626,17 +1660,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
 	struct sk_buff *skb;
-	unsigned int type;
+	unsigned int type, group;
 	int flags = 0;
 
-	if (events & (1 << IPEXP_NEW)) {
+	if (events & (1 << IPEXP_DESTROY)) {
+		type = IPCTNL_MSG_EXP_DELETE;
+		group = NFNLGRP_CONNTRACK_EXP_DESTROY;
+	} else if (events & (1 << IPEXP_NEW)) {
 		type = IPCTNL_MSG_EXP_NEW;
 		flags = NLM_F_CREATE|NLM_F_EXCL;
+		group = NFNLGRP_CONNTRACK_EXP_NEW;
 	} else
 		return 0;
 
-	if (!item->report &&
-	    !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW))
+	if (!item->report && !nfnetlink_has_listeners(net, group))
 		return 0;
 
 	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -1659,8 +1696,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
 	rcu_read_unlock();
 
 	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW,
-		       item->report, GFP_ATOMIC);
+	nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC);
 	return 0;
 
 nla_put_failure:
@@ -1733,6 +1769,8 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
 	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
 	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
 	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING },
+	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
+	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
 };
 
 static int
@@ -1841,7 +1879,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 		}
 
 		/* after list removal, usage count == 1 */
-		nf_ct_unexpect_related(exp);
+		spin_lock_bh(&nf_conntrack_lock);
+		if (del_timer(&exp->timeout)) {
+			nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid,
+						   nlmsg_report(nlh));
+			nf_ct_expect_put(exp);
+		}
+		spin_unlock_bh(&nf_conntrack_lock);
 		/* have to put what we 'get' above.
 		 * after this line usage count == 0 */
 		nf_ct_expect_put(exp);
@@ -1858,7 +1902,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 				m_help = nfct_help(exp->master);
 				if (!strcmp(m_help->helper->name, name) &&
 				    del_timer(&exp->timeout)) {
-					nf_ct_unlink_expect(exp);
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
 					nf_ct_expect_put(exp);
 				}
 			}
@@ -1872,7 +1918,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
 						  &net->ct.expect_hash[i],
 						  hnode) {
 				if (del_timer(&exp->timeout)) {
-					nf_ct_unlink_expect(exp);
+					nf_ct_unlink_expect_report(exp,
+							NETLINK_CB(skb).pid,
+							nlmsg_report(nlh));
 					nf_ct_expect_put(exp);
 				}
 			}
@@ -1918,23 +1966,35 @@ ctnetlink_create_expect(struct net *net, u16 zone,
 	if (!h)
 		return -ENOENT;
 	ct = nf_ct_tuplehash_to_ctrack(h);
-	help = nfct_help(ct);
-
-	if (!help || !help->helper) {
-		/* such conntrack hasn't got any helper, abort */
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
 	exp = nf_ct_expect_alloc(ct);
 	if (!exp) {
 		err = -ENOMEM;
 		goto out;
 	}
+	help = nfct_help(ct);
+	if (!help) {
+		if (!cda[CTA_EXPECT_TIMEOUT]) {
+			err = -EINVAL;
+			goto out;
+		}
+		exp->timeout.expires =
+		  jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ;
+
+		exp->flags = NF_CT_EXPECT_USERSPACE;
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags |=
+				ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+		}
+	} else {
+		if (cda[CTA_EXPECT_FLAGS]) {
+			exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
+			exp->flags &= ~NF_CT_EXPECT_USERSPACE;
+		} else
+			exp->flags = 0;
+	}
 
 	exp->class = 0;
 	exp->expectfn = NULL;
-	exp->flags = 0;
 	exp->master = ct;
 	exp->helper = NULL;
 	memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple));
@@ -2102,6 +2162,7 @@ static void __exit ctnetlink_exit(void)
 {
 	pr_info("ctnetlink: unregistering from nfnetlink.\n");
 
+	nf_ct_remove_userspace_expectations();
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	nf_ct_expect_unregister_notifier(&ctnl_notifier_exp);
 	nf_conntrack_unregister_notifier(&ctnl_notifier);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a..ed6d9295802 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_l3protos);
 
 static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 9dd8cd4fb6e..3fb2b73b24d 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -329,8 +329,8 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph)
 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
    in IP Filter' by Guido van Rooij.
 
-   http://www.nluug.nl/events/sane2000/papers.html
-   http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+   http://www.sane.nl/events/sane2000/papers.html
+   http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
 
    The boundaries and the conditions are changed according to RFC793:
    the packet must intersect the window (i.e. segments may be
@@ -585,8 +585,16 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			 * Let's try to use the data from the packet.
 			 */
 			sender->td_end = end;
+			win <<= sender->td_scale;
 			sender->td_maxwin = (win == 0 ? 1 : win);
 			sender->td_maxend = end + sender->td_maxwin;
+			/*
+			 * We haven't seen traffic in the other direction yet
+			 * but we have to tweak window tracking to pass III
+			 * and IV until that happens.
+			 */
+			if (receiver->td_maxwin == 0)
+				receiver->td_end = receiver->td_maxend = sack;
 		}
 	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
 		     && dir == IP_CT_DIR_ORIGINAL)
@@ -680,7 +688,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		/*
 		 * Update receiver data.
 		 */
-		if (after(end, sender->td_maxend))
+		if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
 			receiver->td_maxwin += end - sender->td_maxend;
 		if (after(sack + win, receiver->td_maxend - 1)) {
 			receiver->td_maxend = sack + win;
@@ -736,27 +744,19 @@ static bool tcp_in_window(const struct nf_conn *ct,
 	return res;
 }
 
-#define	TH_FIN	0x01
-#define	TH_SYN	0x02
-#define	TH_RST	0x04
-#define	TH_PUSH	0x08
-#define	TH_ACK	0x10
-#define	TH_URG	0x20
-#define	TH_ECE	0x40
-#define	TH_CWR	0x80
-
 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */
-static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
+static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
+				 TCPHDR_URG) + 1] =
 {
-	[TH_SYN]			= 1,
-	[TH_SYN|TH_URG]			= 1,
-	[TH_SYN|TH_ACK]			= 1,
-	[TH_RST]			= 1,
-	[TH_RST|TH_ACK]			= 1,
-	[TH_FIN|TH_ACK]			= 1,
-	[TH_FIN|TH_ACK|TH_URG]		= 1,
-	[TH_ACK]			= 1,
-	[TH_ACK|TH_URG]			= 1,
+	[TCPHDR_SYN]				= 1,
+	[TCPHDR_SYN|TCPHDR_URG]			= 1,
+	[TCPHDR_SYN|TCPHDR_ACK]			= 1,
+	[TCPHDR_RST]				= 1,
+	[TCPHDR_RST|TCPHDR_ACK]			= 1,
+	[TCPHDR_FIN|TCPHDR_ACK]			= 1,
+	[TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]	= 1,
+	[TCPHDR_ACK]				= 1,
+	[TCPHDR_ACK|TCPHDR_URG]			= 1,
 };
 
 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
@@ -803,7 +803,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	}
 
 	/* Check TCP flags. */
-	tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH));
+	tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
 	if (!tcp_valid_flags[tcpflags]) {
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 53d892210a0..bcf47eb518e 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr,
 	return len;
 }
 
+static int iswordc(const char c)
+{
+	if (isalnum(c) || c == '!' || c == '"' || c == '%' ||
+	    (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' ||
+	    c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' ||
+	    c == '{' || c == '}' || c == '~')
+		return 1;
+	return 0;
+}
+
+static int word_len(const char *dptr, const char *limit)
+{
+	int len = 0;
+	while (dptr < limit && iswordc(*dptr)) {
+		dptr++;
+		len++;
+	}
+	return len;
+}
+
+static int callid_len(const struct nf_conn *ct, const char *dptr,
+		      const char *limit, int *shift)
+{
+	int len, domain_len;
+
+	len = word_len(dptr, limit);
+	dptr += len;
+	if (!len || dptr == limit || *dptr != '@')
+		return len;
+	dptr++;
+	len++;
+
+	domain_len = word_len(dptr, limit);
+	if (!domain_len)
+		return 0;
+	return len + domain_len;
+}
+
 /* get media type + port length */
 static int media_len(const struct nf_conn *ct, const char *dptr,
 		     const char *limit, int *shift)
@@ -152,6 +190,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
 	const char *end;
 	int ret = 0;
 
+	if (!ct)
+		return 0;
+
 	memset(addr, 0, sizeof(*addr));
 	switch (nf_ct_l3num(ct)) {
 	case AF_INET:
@@ -296,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = {
 	[SIP_HDR_VIA_TCP]		= SIP_HDR("Via", "v", "TCP ", epaddr_len),
 	[SIP_HDR_EXPIRES]		= SIP_HDR("Expires", NULL, NULL, digits_len),
 	[SIP_HDR_CONTENT_LENGTH]	= SIP_HDR("Content-Length", "l", NULL, digits_len),
+	[SIP_HDR_CALL_ID]		= SIP_HDR("Call-Id", "i", NULL, callid_len),
 };
 
 static const char *sip_follow_continuation(const char *dptr, const char *limit)
@@ -1376,7 +1418,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
 	unsigned int msglen, origlen;
 	const char *dptr, *end;
 	s16 diff, tdiff = 0;
-	int ret;
+	int ret = NF_ACCEPT;
 	typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust;
 
 	if (ctinfo != IP_CT_ESTABLISHED &&
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67a..0fb65705b44 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
 #include <linux/seq_file.h>
 #include <linux/percpu.h>
 #include <linux/netdevice.h>
+#include <linux/security.h>
 #include <net/net_namespace.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
 	rcu_read_unlock();
 }
 
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	int ret;
+	u32 len;
+	char *secctx;
+
+	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+	if (ret)
+		return ret;
+
+	ret = seq_printf(s, "secctx=%s ", secctx);
+
+	security_release_secctx(secctx, len);
+	return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+	return 0;
+}
+#endif
+
 /* return 0 on success, 1 in case of error */
 static int ct_seq_show(struct seq_file *s, void *v)
 {
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		goto release;
 #endif
 
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-	if (seq_printf(s, "secmark=%u ", ct->secmark))
+	if (ct_show_secctx(s, ct))
 		goto release;
-#endif
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786b..b07393eab88 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
 #define NF_LOG_PREFIXLEN		128
 #define NFLOGGER_NAME_LEN		64
 
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
 static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
 static DEFINE_MUTEX(nf_log_mutex);
 
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519..74aebed5bd2 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
  * long term mutex.  The handler must provide an an outfn() to accept packets
  * for queueing and must reinject all packets it receives, no matter what.
  */
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
 
 static DEFINE_MUTEX(queue_handler_mutex);
 
diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c
index 5490fc37c92..4d87befb04c 100644
--- a/net/netfilter/nf_tproxy_core.c
+++ b/net/netfilter/nf_tproxy_core.c
@@ -18,41 +18,6 @@
 #include <net/udp.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
-struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
-		      const __be32 saddr, const __be32 daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in, bool listening_only)
-{
-	struct sock *sk;
-
-	/* look up socket */
-	switch (protocol) {
-	case IPPROTO_TCP:
-		if (listening_only)
-			sk = __inet_lookup_listener(net, &tcp_hashinfo,
-						    daddr, ntohs(dport),
-						    in->ifindex);
-		else
-			sk = __inet_lookup(net, &tcp_hashinfo,
-					   saddr, sport, daddr, dport,
-					   in->ifindex);
-		break;
-	case IPPROTO_UDP:
-		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk);
-
-	return sk;
-}
-EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
 
 static void
 nf_tproxy_destructor(struct sk_buff *skb)
@@ -70,7 +35,11 @@ nf_tproxy_destructor(struct sk_buff *skb)
 int
 nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
 {
-	if (inet_sk(sk)->transparent) {
+	bool transparent = (sk->sk_state == TCP_TIME_WAIT) ?
+				inet_twsk(sk)->tw_transparent :
+				inet_sk(sk)->transparent;
+
+	if (transparent) {
 		skb_orphan(skb);
 		skb->sk = sk;
 		skb->destructor = nf_tproxy_destructor;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index fc9a211e629..6a1572b0ab4 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -66,9 +66,10 @@ struct nfulnl_instance {
 	u_int16_t group_num;		/* number of this queue */
 	u_int16_t flags;
 	u_int8_t copy_mode;
+	struct rcu_head rcu;
 };
 
-static DEFINE_RWLOCK(instances_lock);
+static DEFINE_SPINLOCK(instances_lock);
 static atomic_t global_seq;
 
 #define INSTANCE_BUCKETS	16
@@ -88,7 +89,7 @@ __instance_lookup(u_int16_t group_num)
 	struct nfulnl_instance *inst;
 
 	head = &instance_table[instance_hashfn(group_num)];
-	hlist_for_each_entry(inst, pos, head, hlist) {
+	hlist_for_each_entry_rcu(inst, pos, head, hlist) {
 		if (inst->group_num == group_num)
 			return inst;
 	}
@@ -106,22 +107,26 @@ instance_lookup_get(u_int16_t group_num)
 {
 	struct nfulnl_instance *inst;
 
-	read_lock_bh(&instances_lock);
+	rcu_read_lock_bh();
 	inst = __instance_lookup(group_num);
-	if (inst)
-		instance_get(inst);
-	read_unlock_bh(&instances_lock);
+	if (inst && !atomic_inc_not_zero(&inst->use))
+		inst = NULL;
+	rcu_read_unlock_bh();
 
 	return inst;
 }
 
+static void nfulnl_instance_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct nfulnl_instance, rcu));
+	module_put(THIS_MODULE);
+}
+
 static void
 instance_put(struct nfulnl_instance *inst)
 {
-	if (inst && atomic_dec_and_test(&inst->use)) {
-		kfree(inst);
-		module_put(THIS_MODULE);
-	}
+	if (inst && atomic_dec_and_test(&inst->use))
+		call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
 }
 
 static void nfulnl_timer(unsigned long data);
@@ -132,7 +137,7 @@ instance_create(u_int16_t group_num, int pid)
 	struct nfulnl_instance *inst;
 	int err;
 
-	write_lock_bh(&instances_lock);
+	spin_lock_bh(&instances_lock);
 	if (__instance_lookup(group_num)) {
 		err = -EEXIST;
 		goto out_unlock;
@@ -166,32 +171,37 @@ instance_create(u_int16_t group_num, int pid)
 	inst->copy_mode 	= NFULNL_COPY_PACKET;
 	inst->copy_range 	= NFULNL_COPY_RANGE_MAX;
 
-	hlist_add_head(&inst->hlist,
+	hlist_add_head_rcu(&inst->hlist,
 		       &instance_table[instance_hashfn(group_num)]);
 
-	write_unlock_bh(&instances_lock);
+	spin_unlock_bh(&instances_lock);
 
 	return inst;
 
 out_unlock:
-	write_unlock_bh(&instances_lock);
+	spin_unlock_bh(&instances_lock);
 	return ERR_PTR(err);
 }
 
 static void __nfulnl_flush(struct nfulnl_instance *inst);
 
+/* called with BH disabled */
 static void
 __instance_destroy(struct nfulnl_instance *inst)
 {
 	/* first pull it out of the global list */
-	hlist_del(&inst->hlist);
+	hlist_del_rcu(&inst->hlist);
 
 	/* then flush all pending packets from skb */
 
-	spin_lock_bh(&inst->lock);
+	spin_lock(&inst->lock);
+
+	/* lockless readers wont be able to use us */
+	inst->copy_mode = NFULNL_COPY_DISABLED;
+
 	if (inst->skb)
 		__nfulnl_flush(inst);
-	spin_unlock_bh(&inst->lock);
+	spin_unlock(&inst->lock);
 
 	/* and finally put the refcount */
 	instance_put(inst);
@@ -200,9 +210,9 @@ __instance_destroy(struct nfulnl_instance *inst)
 static inline void
 instance_destroy(struct nfulnl_instance *inst)
 {
-	write_lock_bh(&instances_lock);
+	spin_lock_bh(&instances_lock);
 	__instance_destroy(inst);
-	write_unlock_bh(&instances_lock);
+	spin_unlock_bh(&instances_lock);
 }
 
 static int
@@ -403,8 +413,9 @@ __build_packet_message(struct nfulnl_instance *inst,
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV,
-				     htonl(indev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(indev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -430,8 +441,9 @@ __build_packet_message(struct nfulnl_instance *inst,
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */
 			NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV,
-				     htonl(outdev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is a bridge group, we need to look
 			 * for physical device (when called from ipv4) */
@@ -619,6 +631,7 @@ nfulnl_log_packet(u_int8_t pf,
 		size += nla_total_size(data_len);
 		break;
 
+	case NFULNL_COPY_DISABLED:
 	default:
 		goto unlock_and_release;
 	}
@@ -672,7 +685,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
 		int i;
 
 		/* destroy all instances for this pid */
-		write_lock_bh(&instances_lock);
+		spin_lock_bh(&instances_lock);
 		for  (i = 0; i < INSTANCE_BUCKETS; i++) {
 			struct hlist_node *tmp, *t2;
 			struct nfulnl_instance *inst;
@@ -684,7 +697,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
 					__instance_destroy(inst);
 			}
 		}
-		write_unlock_bh(&instances_lock);
+		spin_unlock_bh(&instances_lock);
 	}
 	return NOTIFY_DONE;
 }
@@ -861,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st)
 
 	for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
 		if (!hlist_empty(&instance_table[st->bucket]))
-			return instance_table[st->bucket].first;
+			return rcu_dereference_bh(instance_table[st->bucket].first);
 	}
 	return NULL;
 }
 
 static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
 {
-	h = h->next;
+	h = rcu_dereference_bh(h->next);
 	while (!h) {
 		if (++st->bucket >= INSTANCE_BUCKETS)
 			return NULL;
 
-		h = instance_table[st->bucket].first;
+		h = rcu_dereference_bh(instance_table[st->bucket].first);
 	}
 	return h;
 }
@@ -890,9 +903,9 @@ static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
 }
 
 static void *seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(instances_lock)
+	__acquires(rcu_bh)
 {
-	read_lock_bh(&instances_lock);
+	rcu_read_lock_bh();
 	return get_idx(seq->private, *pos);
 }
 
@@ -903,9 +916,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
 }
 
 static void seq_stop(struct seq_file *s, void *v)
-	__releases(instances_lock)
+	__releases(rcu_bh)
 {
-	read_unlock_bh(&instances_lock);
+	rcu_read_unlock_bh();
 }
 
 static int seq_show(struct seq_file *s, void *v)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 12e1ab37fcd..68e67d19724 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -46,17 +46,19 @@ struct nfqnl_instance {
 	int peer_pid;
 	unsigned int queue_maxlen;
 	unsigned int copy_range;
-	unsigned int queue_total;
 	unsigned int queue_dropped;
 	unsigned int queue_user_dropped;
 
-	unsigned int id_sequence;		/* 'sequence' of pkt ids */
 
 	u_int16_t queue_num;			/* number of this queue */
 	u_int8_t copy_mode;
-
-	spinlock_t lock;
-
+/*
+ * Following fields are dirtied for each queued packet,
+ * keep them in same cache line if possible.
+ */
+	spinlock_t	lock;
+	unsigned int	queue_total;
+	atomic_t	id_sequence;		/* 'sequence' of pkt ids */
 	struct list_head queue_list;		/* packets in queue */
 };
 
@@ -238,32 +240,24 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 
 	outdev = entry->outdev;
 
-	spin_lock_bh(&queue->lock);
-
-	switch ((enum nfqnl_config_mode)queue->copy_mode) {
+	switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) {
 	case NFQNL_COPY_META:
 	case NFQNL_COPY_NONE:
 		break;
 
 	case NFQNL_COPY_PACKET:
 		if (entskb->ip_summed == CHECKSUM_PARTIAL &&
-		    skb_checksum_help(entskb)) {
-			spin_unlock_bh(&queue->lock);
+		    skb_checksum_help(entskb))
 			return NULL;
-		}
-		if (queue->copy_range == 0
-		    || queue->copy_range > entskb->len)
+
+		data_len = ACCESS_ONCE(queue->copy_range);
+		if (data_len == 0 || data_len > entskb->len)
 			data_len = entskb->len;
-		else
-			data_len = queue->copy_range;
 
 		size += nla_total_size(data_len);
 		break;
 	}
 
-	entry->id = queue->id_sequence++;
-
-	spin_unlock_bh(&queue->lock);
 
 	skb = alloc_skb(size, GFP_ATOMIC);
 	if (!skb)
@@ -278,6 +272,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 	nfmsg->version = NFNETLINK_V0;
 	nfmsg->res_id = htons(queue->queue_num);
 
+	entry->id = atomic_inc_return(&queue->id_sequence);
 	pmsg.packet_id 		= htonl(entry->id);
 	pmsg.hw_protocol	= entskb->protocol;
 	pmsg.hook		= entry->hook;
@@ -296,8 +291,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV,
 				     htonl(indev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by __nf_queue */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV,
-				     htonl(indev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(indev)->br->dev->ifindex));
 		} else {
 			/* Case 2: indev is bridge group, we need to look for
 			 * physical device (when called from ipv4) */
@@ -321,8 +317,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV,
 				     htonl(outdev->ifindex));
 			/* this is the bridge group "brX" */
+			/* rcu_read_lock()ed by __nf_queue */
 			NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV,
-				     htonl(outdev->br_port->br->dev->ifindex));
+				     htonl(br_port_get_rcu(outdev)->br->dev->ifindex));
 		} else {
 			/* Case 2: outdev is bridge group, we need to look for
 			 * physical output device (when called from ipv4) */
@@ -866,7 +863,7 @@ static int seq_show(struct seq_file *s, void *v)
 			  inst->peer_pid, inst->queue_total,
 			  inst->copy_mode, inst->copy_range,
 			  inst->queue_dropped, inst->queue_user_dropped,
-			  inst->id_sequence, 1);
+			  atomic_read(&inst->id_sequence), 1);
 }
 
 static const struct seq_operations nfqnl_seq_ops = {
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e34622fa000..80463507420 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets);
 void
 xt_unregister_targets(struct xt_target *target, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		xt_unregister_target(&target[i]);
+	while (n-- > 0)
+		xt_unregister_target(&target[n]);
 }
 EXPORT_SYMBOL(xt_unregister_targets);
 
@@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches);
 void
 xt_unregister_matches(struct xt_match *match, unsigned int n)
 {
-	unsigned int i;
-
-	for (i = 0; i < n; i++)
-		xt_unregister_match(&match[i]);
+	while (n-- > 0)
+		xt_unregister_match(&match[n]);
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
new file mode 100644
index 00000000000..0f642ef8cd2
--- /dev/null
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -0,0 +1,70 @@
+/* iptables module for the packet checksum mangling
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * (C) 2010 Red Hat, Inc.
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_CHECKSUM.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>");
+MODULE_DESCRIPTION("Xtables: checksum modification");
+MODULE_ALIAS("ipt_CHECKSUM");
+MODULE_ALIAS("ip6t_CHECKSUM");
+
+static unsigned int
+checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb_checksum_help(skb);
+
+	return XT_CONTINUE;
+}
+
+static int checksum_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_CHECKSUM_info *einfo = par->targinfo;
+
+	if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
+		pr_info("unsupported CHECKSUM operation %x\n", einfo->operation);
+		return -EINVAL;
+	}
+	if (!einfo->operation) {
+		pr_info("no CHECKSUM operation enabled\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct xt_target checksum_tg_reg __read_mostly = {
+	.name		= "CHECKSUM",
+	.family		= NFPROTO_UNSPEC,
+	.target		= checksum_tg,
+	.targetsize	= sizeof(struct xt_CHECKSUM_info),
+	.table		= "mangle",
+	.checkentry	= checksum_tg_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init checksum_tg_init(void)
+{
+	return xt_register_target(&checksum_tg_reg);
+}
+
+static void __exit checksum_tg_exit(void)
+{
+	xt_unregister_target(&checksum_tg_reg);
+}
+
+module_init(checksum_tg_init);
+module_exit(checksum_tg_exit);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 562bf3266e0..782e51986a6 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/skbuff.h>
-#include <linux/selinux.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/netfilter/x_tables.h>
@@ -67,7 +66,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 
 	if (info->flags & XT_CT_NOTRACK) {
-		ct = &nf_conntrack_untracked;
+		ct = nf_ct_untracked_get();
 		atomic_inc(&ct->ct_general.use);
 		goto out;
 	}
@@ -132,7 +131,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par)
 	struct nf_conn *ct = info->ct;
 	struct nf_conn_help *help;
 
-	if (ct != &nf_conntrack_untracked) {
+	if (!nf_ct_is_untracked(ct)) {
 		help = nfct_help(ct);
 		if (help)
 			module_put(help->helper->me);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
new file mode 100644
index 00000000000..be1f22e1354
--- /dev/null
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -0,0 +1,315 @@
+/*
+ * linux/net/netfilter/xt_IDLETIMER.c
+ *
+ * Netfilter module to trigger a timer when packet matches.
+ * After timer expires a kevent will be sent.
+ *
+ * Copyright (C) 2004, 2010 Nokia Corporation
+ * Written by Timo Teras <ext-timo.teras@nokia.com>
+ *
+ * Converted to x_tables and reworked for upstream inclusion
+ * by Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * Contact: Luciano Coelho <luciano.coelho@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_IDLETIMER.h>
+#include <linux/kdev_t.h>
+#include <linux/kobject.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+
+struct idletimer_tg_attr {
+	struct attribute attr;
+	ssize_t	(*show)(struct kobject *kobj,
+			struct attribute *attr, char *buf);
+};
+
+struct idletimer_tg {
+	struct list_head entry;
+	struct timer_list timer;
+	struct work_struct work;
+
+	struct kobject *kobj;
+	struct idletimer_tg_attr attr;
+
+	unsigned int refcnt;
+};
+
+static LIST_HEAD(idletimer_tg_list);
+static DEFINE_MUTEX(list_mutex);
+
+static struct kobject *idletimer_tg_kobj;
+
+static
+struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
+{
+	struct idletimer_tg *entry;
+
+	BUG_ON(!label);
+
+	list_for_each_entry(entry, &idletimer_tg_list, entry) {
+		if (!strcmp(label, entry->attr.attr.name))
+			return entry;
+	}
+
+	return NULL;
+}
+
+static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
+				 char *buf)
+{
+	struct idletimer_tg *timer;
+	unsigned long expires = 0;
+
+	mutex_lock(&list_mutex);
+
+	timer =	__idletimer_tg_find_by_label(attr->name);
+	if (timer)
+		expires = timer->timer.expires;
+
+	mutex_unlock(&list_mutex);
+
+	if (time_after(expires, jiffies))
+		return sprintf(buf, "%u\n",
+			       jiffies_to_msecs(expires - jiffies) / 1000);
+
+	return sprintf(buf, "0\n");
+}
+
+static void idletimer_tg_work(struct work_struct *work)
+{
+	struct idletimer_tg *timer = container_of(work, struct idletimer_tg,
+						  work);
+
+	sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+}
+
+static void idletimer_tg_expired(unsigned long data)
+{
+	struct idletimer_tg *timer = (struct idletimer_tg *) data;
+
+	pr_debug("timer %s expired\n", timer->attr.attr.name);
+
+	schedule_work(&timer->work);
+}
+
+static int idletimer_tg_create(struct idletimer_tg_info *info)
+{
+	int ret;
+
+	info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+	if (!info->timer) {
+		pr_debug("couldn't alloc timer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+	if (!info->timer->attr.attr.name) {
+		pr_debug("couldn't alloc attribute name\n");
+		ret = -ENOMEM;
+		goto out_free_timer;
+	}
+	info->timer->attr.attr.mode = S_IRUGO;
+	info->timer->attr.show = idletimer_tg_show;
+
+	ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+	if (ret < 0) {
+		pr_debug("couldn't add file to sysfs");
+		goto out_free_attr;
+	}
+
+	list_add(&info->timer->entry, &idletimer_tg_list);
+
+	setup_timer(&info->timer->timer, idletimer_tg_expired,
+		    (unsigned long) info->timer);
+	info->timer->refcnt = 1;
+
+	mod_timer(&info->timer->timer,
+		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+	INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+	return 0;
+
+out_free_attr:
+	kfree(info->timer->attr.attr.name);
+out_free_timer:
+	kfree(info->timer);
+out:
+	return ret;
+}
+
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target(struct sk_buff *skb,
+					 const struct xt_action_param *par)
+{
+	const struct idletimer_tg_info *info = par->targinfo;
+
+	pr_debug("resetting timer %s, timeout period %u\n",
+		 info->label, info->timeout);
+
+	BUG_ON(!info->timer);
+
+	mod_timer(&info->timer->timer,
+		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+	return XT_CONTINUE;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+	struct idletimer_tg_info *info = par->targinfo;
+	int ret;
+
+	pr_debug("checkentry targinfo%s\n", info->label);
+
+	if (info->timeout == 0) {
+		pr_debug("timeout value is zero\n");
+		return -EINVAL;
+	}
+
+	if (info->label[0] == '\0' ||
+	    strnlen(info->label,
+		    MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) {
+		pr_debug("label is empty or not nul-terminated\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&list_mutex);
+
+	info->timer = __idletimer_tg_find_by_label(info->label);
+	if (info->timer) {
+		info->timer->refcnt++;
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+
+		pr_debug("increased refcnt of timer %s to %u\n",
+			 info->label, info->timer->refcnt);
+	} else {
+		ret = idletimer_tg_create(info);
+		if (ret < 0) {
+			pr_debug("failed to create timer\n");
+			mutex_unlock(&list_mutex);
+			return ret;
+		}
+	}
+
+	mutex_unlock(&list_mutex);
+	return 0;
+}
+
+static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	const struct idletimer_tg_info *info = par->targinfo;
+
+	pr_debug("destroy targinfo %s\n", info->label);
+
+	mutex_lock(&list_mutex);
+
+	if (--info->timer->refcnt == 0) {
+		pr_debug("deleting timer %s\n", info->label);
+
+		list_del(&info->timer->entry);
+		del_timer_sync(&info->timer->timer);
+		sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+		kfree(info->timer->attr.attr.name);
+		kfree(info->timer);
+	} else {
+		pr_debug("decreased refcnt of timer %s to %u\n",
+			 info->label, info->timer->refcnt);
+	}
+
+	mutex_unlock(&list_mutex);
+}
+
+static struct xt_target idletimer_tg __read_mostly = {
+	.name		= "IDLETIMER",
+	.family		= NFPROTO_UNSPEC,
+	.target		= idletimer_tg_target,
+	.targetsize     = sizeof(struct idletimer_tg_info),
+	.checkentry	= idletimer_tg_checkentry,
+	.destroy        = idletimer_tg_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct class *idletimer_tg_class;
+
+static struct device *idletimer_tg_device;
+
+static int __init idletimer_tg_init(void)
+{
+	int err;
+
+	idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+	err = PTR_ERR(idletimer_tg_class);
+	if (IS_ERR(idletimer_tg_class)) {
+		pr_debug("couldn't register device class\n");
+		goto out;
+	}
+
+	idletimer_tg_device = device_create(idletimer_tg_class, NULL,
+					    MKDEV(0, 0), NULL, "timers");
+	err = PTR_ERR(idletimer_tg_device);
+	if (IS_ERR(idletimer_tg_device)) {
+		pr_debug("couldn't register system device\n");
+		goto out_class;
+	}
+
+	idletimer_tg_kobj = &idletimer_tg_device->kobj;
+
+	err =  xt_register_target(&idletimer_tg);
+	if (err < 0) {
+		pr_debug("couldn't register xt target\n");
+		goto out_dev;
+	}
+
+	return 0;
+out_dev:
+	device_destroy(idletimer_tg_class, MKDEV(0, 0));
+out_class:
+	class_destroy(idletimer_tg_class);
+out:
+	return err;
+}
+
+static void __exit idletimer_tg_exit(void)
+{
+	xt_unregister_target(&idletimer_tg);
+
+	device_destroy(idletimer_tg_class, MKDEV(0, 0));
+	class_destroy(idletimer_tg_class);
+}
+
+module_init(idletimer_tg_init);
+module_exit(idletimer_tg_exit);
+
+MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
+MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
+MODULE_DESCRIPTION("Xtables: idle time monitor");
+MODULE_LICENSE("GPL v2");
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index 512b9123252..9d782181b6c 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -23,7 +23,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	   If there is a real ct entry correspondig to this packet,
 	   it'll hang aroun till timing out. We don't deal with it
 	   for performance reasons. JK */
-	skb->nfct = &nf_conntrack_untracked.ct_general;
+	skb->nfct = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 69c01e10f8a..de079abd5bc 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -60,13 +60,22 @@ struct xt_rateest *xt_rateest_lookup(const char *name)
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
+static void xt_rateest_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct xt_rateest, rcu));
+}
+
 void xt_rateest_put(struct xt_rateest *est)
 {
 	mutex_lock(&xt_rateest_mutex);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
 		gen_kill_estimator(&est->bstats, &est->rstats);
-		kfree(est);
+		/*
+		 * gen_estimator est_timer() might access est->lock or bstats,
+		 * wait a RCU grace period before freeing 'est'
+		 */
+		call_rcu(&est->rcu, xt_rateest_free_rcu);
 	}
 	mutex_unlock(&xt_rateest_mutex);
 }
@@ -179,6 +188,7 @@ static int __init xt_rateest_tg_init(void)
 static void __exit xt_rateest_tg_fini(void)
 {
 	xt_unregister_target(&xt_rateest_tg_reg);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s (xt_rateest_free_rcu) */
 }
 
 
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b..9faf5e050b7 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
+#include <linux/security.h>
 #include <linux/skbuff.h>
-#include <linux/selinux.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_SECMARK.h>
 
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 	switch (mode) {
 	case SECMARK_MODE_SEL:
-		secmark = info->u.sel.selsid;
+		secmark = info->secid;
 		break;
-
 	default:
 		BUG();
 	}
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info *info)
 {
 	int err;
-	struct xt_secmark_target_selinux_info *sel = &info->u.sel;
 
-	sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0';
+	info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+	info->secid = 0;
 
-	err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+	err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+				       &info->secid);
 	if (err) {
 		if (err == -EINVAL)
-			pr_info("invalid SELinux context \'%s\'\n",
-				sel->selctx);
+			pr_info("invalid security context \'%s\'\n", info->secctx);
 		return err;
 	}
 
-	if (!sel->selsid) {
-		pr_info("unable to map SELinux context \'%s\'\n", sel->selctx);
+	if (!info->secid) {
+		pr_info("unable to map security context \'%s\'\n", info->secctx);
 		return -ENOENT;
 	}
 
-	err = selinux_secmark_relabel_packet_permission(sel->selsid);
+	err = security_secmark_relabel_packet(info->secid);
 	if (err) {
 		pr_info("unable to obtain relabeling permission\n");
 		return err;
 	}
 
-	selinux_secmark_refcount_inc();
+	security_secmark_refcount_inc();
 	return 0;
 }
 
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
 
 	switch (info->mode) {
 	case SECMARK_MODE_SEL:
-		err = checkentry_selinux(info);
-		if (err <= 0)
-			return err;
 		break;
-
 	default:
 		pr_info("invalid mode: %hu\n", info->mode);
 		return -EINVAL;
 	}
 
+	err = checkentry_lsm(info);
+	if (err)
+		return err;
+
 	if (!mode)
 		mode = info->mode;
 	return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
 {
 	switch (mode) {
 	case SECMARK_MODE_SEL:
-		selinux_secmark_refcount_dec();
+		security_secmark_refcount_dec();
 	}
 }
 
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 62ec021fbd5..eb81c380da1 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -165,8 +165,8 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb,
 	rcu_read_unlock();
 
 	if (rt != NULL) {
-		mtu = dst_mtu(&rt->u.dst);
-		dst_release(&rt->u.dst);
+		mtu = dst_mtu(&rt->dst);
+		dst_release(&rt->dst);
 	}
 	return mtu;
 }
@@ -220,15 +220,13 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 }
 #endif
 
-#define TH_SYN 0x02
-
 /* Must specify -p tcp --syn */
 static inline bool find_syn_match(const struct xt_entry_match *m)
 {
 	const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data;
 
 	if (strcmp(m->u.kernel.match->name, "tcp") == 0 &&
-	    tcpinfo->flg_cmp & TH_SYN &&
+	    tcpinfo->flg_cmp & TCPHDR_SYN &&
 	    !(tcpinfo->invflags & XT_TCP_INV_FLAGS))
 		return true;
 
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 859d9fd429c..22a2d421e7e 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -77,8 +77,8 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
 		return false;
 
 	skb_dst_drop(skb);
-	skb_dst_set(skb, &rt->u.dst);
-	skb->dev      = rt->u.dst.dev;
+	skb_dst_set(skb, &rt->dst);
+	skb->dev      = rt->dst.dev;
 	skb->protocol = htons(ETH_P_IP);
 	return true;
 }
@@ -104,7 +104,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 #ifdef WITH_CONNTRACK
 	/* Avoid counting cloned packets towards the original connection. */
 	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_conntrack_untracked.ct_general;
+	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 #endif
@@ -177,7 +177,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 #ifdef WITH_CONNTRACK
 	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_conntrack_untracked.ct_general;
+	skb->nfct     = &nf_ct_untracked_get()->ct_general;
 	skb->nfctinfo = IP_CT_NEW;
 	nf_conntrack_get(skb->nfct);
 #endif
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index e1a0dedac25..640678f47a2 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
 /*
  * Transparent proxy support for Linux/iptables
  *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
  * Author: Balazs Scheidler, Krisztian Kovacs
  *
  * This program is free software; you can redistribute it and/or modify
@@ -16,19 +16,98 @@
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#define XT_TPROXY_HAVE_IPV6 1
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
 #include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr;
+
+	if (user_laddr)
+		return user_laddr;
+
+	laddr = 0;
+	rcu_read_lock();
+	indev = __in_dev_get_rcu(skb->dev);
+	for_primary_ifa(indev) {
+		laddr = ifa->ifa_local;
+		break;
+	} endfor_ifa(indev);
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @laddr:	IPv4 address to redirect to or zero.
+ * @lport:	TCP port to redirect to or zero.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+			struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr _hdr, *hp;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					    iph->saddr, laddr ? laddr : iph->daddr,
+					    hp->source, lport ? lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
 
 static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+	   u_int32_t mark_mask, u_int32_t mark_value)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
 
@@ -36,10 +115,195 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (hp == NULL)
 		return NF_DROP;
 
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-				   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-				   hp->source, tgi->lport ? tgi->lport : hp->dest,
-				   par->in, true);
+				   iph->saddr, iph->daddr,
+				   hp->source, hp->dest,
+				   skb->dev, NFT_LOOKUP_ESTABLISHED);
+
+	laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+	if (!lport)
+		lport = hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
+	else if (!sk)
+		/* no, there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
+					   iph->saddr, laddr,
+					   hp->source, lport,
+					   skb->dev, NFT_LOOKUP_LISTENER);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+		/* This should be in a separate target, but we don't do multiple
+		   targets on the same rule yet */
+		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
+
+		pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+			 iph->protocol, &iph->daddr, ntohs(hp->dest),
+			 &laddr, ntohs(lport), skb->mark);
+		return NF_ACCEPT;
+	}
+
+	pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
+		 iph->protocol, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
+	return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#ifdef XT_TPROXY_HAVE_IPV6
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+	      const struct in6_addr *daddr)
+{
+	struct inet6_dev *indev;
+	struct inet6_ifaddr *ifa;
+	struct in6_addr *laddr;
+
+	if (!ipv6_addr_any(user_laddr))
+		return user_laddr;
+	laddr = NULL;
+
+	rcu_read_lock();
+	indev = __in6_dev_get(skb->dev);
+	if (indev)
+		list_for_each_entry(ifa, &indev->addr_list, if_list) {
+			if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+				continue;
+
+			laddr = &ifa->addr;
+			break;
+		}
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					    &iph->saddr,
+					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+					    hp->source,
+					    tgi->lport ? tgi->lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+	const struct in6_addr *laddr;
+	__be16 lport;
+	int thoff;
+	int tproto;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+
+	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+	lport = tgi->lport ? tgi->lport : hp->dest;
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+	else if (!sk)
+		/* no there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					   &iph->saddr, laddr,
+					   hp->source, lport,
+					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && nf_tproxy_assign_sock(skb, sk)) {
@@ -47,19 +311,34 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
 
-		pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-			 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-			 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+		pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+			 tproto, &iph->saddr, ntohs(hp->source),
+			 laddr, ntohs(lport), skb->mark);
 		return NF_ACCEPT;
 	}
 
-	pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-		 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+	pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
+		 tproto, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
+
 	return NF_DROP;
 }
 
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_ip6 *i = par->entryinfo;
+
+	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+	    && !(i->flags & IP6T_INV_PROTO))
+		return 0;
+
+	pr_info("Can be used only in combination with "
+		"either -p tcp or -p udp\n");
+	return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct ipt_ip *i = par->entryinfo;
 
@@ -72,31 +351,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
 	return -EINVAL;
 }
 
-static struct xt_target tproxy_tg_reg __read_mostly = {
-	.name		= "TPROXY",
-	.family		= AF_INET,
-	.table		= "mangle",
-	.target		= tproxy_tg,
-	.targetsize	= sizeof(struct xt_tproxy_target_info),
-	.checkentry	= tproxy_tg_check,
-	.hooks		= 1 << NF_INET_PRE_ROUTING,
-	.me		= THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v0,
+		.revision	= 0,
+		.targetsize	= sizeof(struct xt_tproxy_target_info),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#ifdef XT_TPROXY_HAVE_IPV6
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV6,
+		.table		= "mangle",
+		.target		= tproxy_tg6_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg6_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#endif
+
 };
 
 static int __init tproxy_tg_init(void)
 {
 	nf_defrag_ipv4_enable();
-	return xt_register_target(&tproxy_tg_reg);
+#ifdef XT_TPROXY_HAVE_IPV6
+	nf_defrag_ipv6_enable();
+#endif
+
+	return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 static void __exit tproxy_tg_exit(void)
 {
-	xt_unregister_target(&tproxy_tg_reg);
+	xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 module_init(tproxy_tg_init);
 module_exit(tproxy_tg_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
 MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
 MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 30b95a1c1c8..f4af1bfafb1 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -120,7 +120,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (ct == NULL)
 		return false;
 
-	if (ct == &nf_conntrack_untracked)
+	if (nf_ct_is_untracked(ct))
 		return false;
 
 	if (ct->master)
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 73517835303..5b138506690 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -112,6 +112,16 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+
+	/*
+	 * This filter cannot function correctly unless connection tracking
+	 * accounting is enabled, so complain in the hope that someone notices.
+	 */
+	if (!nf_ct_acct_enabled(par->net)) {
+		pr_warning("Forcing CT accounting to be enabled\n");
+		nf_ct_set_acct(par->net, true);
+	}
+
 	return ret;
 }
 
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 39681f10291..e536710ad91 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -123,11 +123,12 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
 
 	ct = nf_ct_get(skb, &ctinfo);
 
-	if (ct == &nf_conntrack_untracked)
-		statebit = XT_CONNTRACK_STATE_UNTRACKED;
-	else if (ct != NULL)
-		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
-	else
+	if (ct) {
+		if (nf_ct_is_untracked(ct))
+			statebit = XT_CONNTRACK_STATE_UNTRACKED;
+		else
+			statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	} else
 		statebit = XT_CONNTRACK_STATE_INVALID;
 
 	if (info->match_flags & XT_CONNTRACK_STATE) {
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
new file mode 100644
index 00000000000..b39db8a5cba
--- /dev/null
+++ b/net/netfilter/xt_cpu.c
@@ -0,0 +1,63 @@
+/* Kernel module to match running CPU */
+
+/*
+ * Might be used to distribute connections on several daemons, if
+ * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable,
+ * each RX queue IRQ affined to one CPU (1:1 mapping)
+ *
+ */
+
+/* (C) 2010 Eric Dumazet
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/xt_cpu.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
+MODULE_DESCRIPTION("Xtables: CPU match");
+
+static int cpu_mt_check(const struct xt_mtchk_param *par)
+{
+	const struct xt_cpu_info *info = par->matchinfo;
+
+	if (info->invert & ~1)
+		return -EINVAL;
+	return 0;
+}
+
+static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_cpu_info *info = par->matchinfo;
+
+	return (info->cpu == smp_processor_id()) ^ info->invert;
+}
+
+static struct xt_match cpu_mt_reg __read_mostly = {
+	.name       = "cpu",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.checkentry = cpu_mt_check,
+	.match      = cpu_mt,
+	.matchsize  = sizeof(struct xt_cpu_info),
+	.me         = THIS_MODULE,
+};
+
+static int __init cpu_mt_init(void)
+{
+	return xt_register_match(&cpu_mt_reg);
+}
+
+static void __exit cpu_mt_exit(void)
+{
+	xt_unregister_match(&cpu_mt_reg);
+}
+
+module_init(cpu_mt_init);
+module_exit(cpu_mt_exit);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index b46a8390896..9228ee0dc11 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 {
 	__be16 _ports[2], *ports;
 	u8 nexthdr;
+	int poff;
 
 	memset(dst, 0, sizeof(*dst));
 
@@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 		return 0;
 	}
 
-	switch (nexthdr) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-	case IPPROTO_SCTP:
-	case IPPROTO_DCCP:
-		ports = skb_header_pointer(skb, protoff, sizeof(_ports),
+	poff = proto_ports_offset(nexthdr);
+	if (poff >= 0) {
+		ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports),
 					   &_ports);
-		break;
-	default:
+	} else {
 		_ports[0] = _ports[1] = 0;
 		ports = _ports;
-		break;
 	}
 	if (!ports)
 		return -1;
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
new file mode 100644
index 00000000000..9127a3d8aa3
--- /dev/null
+++ b/net/netfilter/xt_ipvs.c
@@ -0,0 +1,188 @@
+/*
+ *	xt_ipvs - kernel module to match IPVS connection properties
+ *
+ *	Author: Hannes Eder <heder@google.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#endif
+#include <linux/ip_vs.h>
+#include <linux/types.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_ipvs.h>
+#include <net/netfilter/nf_conntrack.h>
+
+#include <net/ip_vs.h>
+
+MODULE_AUTHOR("Hannes Eder <heder@google.com>");
+MODULE_DESCRIPTION("Xtables: match IPVS connection properties");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_ipvs");
+MODULE_ALIAS("ip6t_ipvs");
+
+/* borrowed from xt_conntrack */
+static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr,
+			    const union nf_inet_addr *uaddr,
+			    const union nf_inet_addr *umask,
+			    unsigned int l3proto)
+{
+	if (l3proto == NFPROTO_IPV4)
+		return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0;
+#ifdef CONFIG_IP_VS_IPV6
+	else if (l3proto == NFPROTO_IPV6)
+		return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6,
+		       &uaddr->in6) == 0;
+#endif
+	else
+		return false;
+}
+
+static bool
+ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_ipvs_mtinfo *data = par->matchinfo;
+	/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
+	const u_int8_t family = par->family;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	bool match = true;
+
+	if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+		match = skb->ipvs_property ^
+			!!(data->invert & XT_IPVS_IPVS_PROPERTY);
+		goto out;
+	}
+
+	/* other flags than XT_IPVS_IPVS_PROPERTY are set */
+	if (!skb->ipvs_property) {
+		match = false;
+		goto out;
+	}
+
+	ip_vs_fill_iphdr(family, skb_network_header(skb), &iph);
+
+	if (data->bitmask & XT_IPVS_PROTO)
+		if ((iph.protocol == data->l4proto) ^
+		    !(data->invert & XT_IPVS_PROTO)) {
+			match = false;
+			goto out;
+		}
+
+	pp = ip_vs_proto_get(iph.protocol);
+	if (unlikely(!pp)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */);
+	if (unlikely(cp == NULL)) {
+		match = false;
+		goto out;
+	}
+
+	/*
+	 * We found a connection, i.e. ct != 0, make sure to call
+	 * __ip_vs_conn_put before returning.  In our case jump to out_put_con.
+	 */
+
+	if (data->bitmask & XT_IPVS_VPORT)
+		if ((cp->vport == data->vport) ^
+		    !(data->invert & XT_IPVS_VPORT)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VPORTCTL)
+		if ((cp->control != NULL &&
+		     cp->control->vport == data->vportctl) ^
+		    !(data->invert & XT_IPVS_VPORTCTL)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_DIR) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct == NULL || nf_ct_is_untracked(ct)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+		if ((ctinfo >= IP_CT_IS_REPLY) ^
+		    !!(data->invert & XT_IPVS_DIR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+	if (data->bitmask & XT_IPVS_METHOD)
+		if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^
+		    !(data->invert & XT_IPVS_METHOD)) {
+			match = false;
+			goto out_put_cp;
+		}
+
+	if (data->bitmask & XT_IPVS_VADDR) {
+		if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr,
+				    &data->vmask, family) ^
+		    !(data->invert & XT_IPVS_VADDR)) {
+			match = false;
+			goto out_put_cp;
+		}
+	}
+
+out_put_cp:
+	__ip_vs_conn_put(cp);
+out:
+	pr_debug("match=%d\n", match);
+	return match;
+}
+
+static int ipvs_mt_check(const struct xt_mtchk_param *par)
+{
+	if (par->family != NFPROTO_IPV4
+#ifdef CONFIG_IP_VS_IPV6
+	    && par->family != NFPROTO_IPV6
+#endif
+		) {
+		pr_info("protocol family %u not supported\n", par->family);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match xt_ipvs_mt_reg __read_mostly = {
+	.name       = "ipvs",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = ipvs_mt,
+	.checkentry = ipvs_mt_check,
+	.matchsize  = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+	.me         = THIS_MODULE,
+};
+
+static int __init ipvs_mt_init(void)
+{
+	return xt_register_match(&xt_ipvs_mt_reg);
+}
+
+static void __exit ipvs_mt_exit(void)
+{
+	xt_unregister_match(&xt_ipvs_mt_reg);
+}
+
+module_init(ipvs_mt_init);
+module_exit(ipvs_mt_exit);
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index b4f7dfea598..70eb2b4984d 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -11,7 +11,8 @@
 #include <linux/netfilter/xt_quota.h>
 
 struct xt_quota_priv {
-	uint64_t quota;
+	spinlock_t	lock;
+	uint64_t	quota;
 };
 
 MODULE_LICENSE("GPL");
@@ -20,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: countdown quota match");
 MODULE_ALIAS("ipt_quota");
 MODULE_ALIAS("ip6t_quota");
 
-static DEFINE_SPINLOCK(quota_lock);
-
 static bool
 quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -29,7 +28,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	struct xt_quota_priv *priv = q->master;
 	bool ret = q->flags & XT_QUOTA_INVERT;
 
-	spin_lock_bh(&quota_lock);
+	spin_lock_bh(&priv->lock);
 	if (priv->quota >= skb->len) {
 		priv->quota -= skb->len;
 		ret = !ret;
@@ -37,9 +36,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		/* we do not allow even small packets from now on */
 		priv->quota = 0;
 	}
-	/* Copy quota back to matchinfo so that iptables can display it */
-	q->quota = priv->quota;
-	spin_unlock_bh(&quota_lock);
+	spin_unlock_bh(&priv->lock);
 
 	return ret;
 }
@@ -55,6 +52,7 @@ static int quota_mt_check(const struct xt_mtchk_param *par)
 	if (q->master == NULL)
 		return -ENOMEM;
 
+	spin_lock_init(&q->master->lock);
 	q->master->quota = q->quota;
 	return 0;
 }
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 76aec6a4476..d2ff15a2412 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -567,6 +567,7 @@ static const struct file_operations recent_mt_fops = {
 	.write   = recent_mt_proc_write,
 	.release = seq_release_private,
 	.owner   = THIS_MODULE,
+	.llseek = seq_lseek,
 };
 
 static int __net_init recent_proc_net_init(struct net *net)
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index c04fcf385c5..ef36a56a02c 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -3,6 +3,7 @@
 #include <linux/skbuff.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/sctp/sctp.h>
 #include <linux/sctp.h>
 
 #include <linux/netfilter/x_tables.h>
@@ -67,7 +68,7 @@ match_packet(const struct sk_buff *skb,
 			 ++i, offset, sch->type, htons(sch->length),
 			 sch->flags);
 #endif
-		offset += (ntohs(sch->length) + 3) & ~3;
+		offset += WORD_ROUND(ntohs(sch->length));
 
 		pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset);
 
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3d54c236a1b..00d6ae83830 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -22,6 +22,12 @@
 #include <net/netfilter/nf_tproxy_core.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#define XT_SOCKET_HAVE_IPV6 1
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
 #include <linux/netfilter/xt_socket.h>
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -30,7 +36,7 @@
 #endif
 
 static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
 		    u8 *protocol,
 		    __be32 *raddr,
 		    __be32 *laddr,
@@ -86,7 +92,6 @@ extract_icmp_fields(const struct sk_buff *skb,
 	return 0;
 }
 
-
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	     const struct xt_socket_mtinfo1 *info)
@@ -115,7 +120,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 		dport = hp->dest;
 
 	} else if (iph->protocol == IPPROTO_ICMP) {
-		if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
 					&sport, &dport))
 			return false;
 	} else {
@@ -127,7 +132,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	 * reply packet of an established SNAT-ted connection. */
 
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct && (ct != &nf_conntrack_untracked) &&
+	if (ct && !nf_ct_is_untracked(ct) &&
 	    ((iph->protocol != IPPROTO_ICMP &&
 	      ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
 	     (iph->protocol == IPPROTO_ICMP &&
@@ -142,7 +147,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 #endif
 
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol,
-				   saddr, daddr, sport, dport, par->in, false);
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
 	if (sk != NULL) {
 		bool wildcard;
 		bool transparent = true;
@@ -165,32 +170,156 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 			sk = NULL;
 	}
 
-	pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport),
-		 ntohl(daddr), ntohs(dport),
-		 ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+	pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n",
+		 protocol, &saddr, ntohs(sport),
+		 &daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
 
 	return (sk != NULL);
 }
 
 static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, NULL);
 }
 
 static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, par->matchinfo);
 }
 
+#ifdef XT_SOCKET_HAVE_IPV6
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+		     unsigned int outside_hdrlen,
+		     int *protocol,
+		     struct in6_addr **raddr,
+		     struct in6_addr **laddr,
+		     __be16 *rport,
+		     __be16 *lport)
+{
+	struct ipv6hdr *inside_iph, _inside_iph;
+	struct icmp6hdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+	u8 inside_nexthdr;
+	int inside_hdrlen;
+
+	icmph = skb_header_pointer(skb, outside_hdrlen,
+				   sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+		return 1;
+
+	inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
+	if (inside_iph == NULL)
+		return 1;
+	inside_nexthdr = inside_iph->nexthdr;
+
+	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+	if (inside_hdrlen < 0)
+		return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+	if (inside_nexthdr != IPPROTO_TCP &&
+	    inside_nexthdr != IPPROTO_UDP)
+		return 1;
+
+	ports = skb_header_pointer(skb, inside_hdrlen,
+				   sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inside_nexthdr;
+	*laddr = &inside_iph->saddr;
+	*lport = ports[0];
+	*raddr = &inside_iph->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	struct in6_addr *daddr, *saddr;
+	__be16 dport, sport;
+	int thoff, tproto;
+	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return false;
+
+		saddr = &iph->saddr;
+		sport = hp->source;
+		daddr = &iph->daddr;
+		dport = hp->dest;
+
+	} else if (tproto == IPPROTO_ICMPV6) {
+		if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+					 &sport, &dport))
+			return false;
+	} else {
+		return false;
+	}
+
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+	if (sk != NULL) {
+		bool wildcard;
+		bool transparent = true;
+
+		/* Ignore sockets listening on INADDR_ANY */
+		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+		/* Ignore non-transparent sockets,
+		   if XT_SOCKET_TRANSPARENT is used */
+		if (info && info->flags & XT_SOCKET_TRANSPARENT)
+			transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+					inet_sk(sk)->transparent) ||
+				       (sk->sk_state == TCP_TIME_WAIT &&
+					inet_twsk(sk)->tw_transparent));
+
+		nf_tproxy_put_sock(sk);
+
+		if (wildcard || !transparent)
+			sk = NULL;
+	}
+
+	pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu "
+		 "(orig %pI6:%hu) sock %p\n",
+		 tproto, saddr, ntohs(sport),
+		 daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+	return (sk != NULL);
+}
+#endif
+
 static struct xt_match socket_mt_reg[] __read_mostly = {
 	{
 		.name		= "socket",
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v0,
+		.match		= socket_mt4_v0,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
@@ -199,17 +328,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.name		= "socket",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v1,
+		.match		= socket_mt4_v1,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
 	},
+#ifdef XT_SOCKET_HAVE_IPV6
+	{
+		.name		= "socket",
+		.revision	= 1,
+		.family		= NFPROTO_IPV6,
+		.match		= socket_mt6_v1,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#endif
 };
 
 static int __init socket_mt_init(void)
 {
 	nf_defrag_ipv4_enable();
+#ifdef XT_SOCKET_HAVE_IPV6
+	nf_defrag_ipv6_enable();
+#endif
+
 	return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
 }
 
@@ -225,3 +370,4 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
 MODULE_DESCRIPTION("x_tables socket match module");
 MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index e12e053d378..a507922d80c 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -26,14 +26,16 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_state_info *sinfo = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
 	unsigned int statebit;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
-	if (nf_ct_is_untracked(skb))
-		statebit = XT_STATE_UNTRACKED;
-	else if (!nf_ct_get(skb, &ctinfo))
+	if (!ct)
 		statebit = XT_STATE_INVALID;
-	else
-		statebit = XT_STATE_BIT(ctinfo);
-
+	else {
+		if (nf_ct_is_untracked(ct))
+			statebit = XT_STATE_UNTRACKED;
+		else
+			statebit = XT_STATE_BIT(ctinfo);
+	}
 	return (sinfo->statemask & statebit);
 }
 
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 96e62b8fd6b..42ecb71d445 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -18,8 +18,8 @@
 #include <linux/netfilter/x_tables.h>
 
 struct xt_statistic_priv {
-	uint32_t count;
-};
+	atomic_t count;
+} ____cacheline_aligned_in_smp;
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
@@ -27,13 +27,12 @@ MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)");
 MODULE_ALIAS("ipt_statistic");
 MODULE_ALIAS("ip6t_statistic");
 
-static DEFINE_SPINLOCK(nth_lock);
-
 static bool
 statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_statistic_info *info = par->matchinfo;
 	bool ret = info->flags & XT_STATISTIC_INVERT;
+	int nval, oval;
 
 	switch (info->mode) {
 	case XT_STATISTIC_MODE_RANDOM:
@@ -41,12 +40,12 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			ret = !ret;
 		break;
 	case XT_STATISTIC_MODE_NTH:
-		spin_lock_bh(&nth_lock);
-		if (info->master->count++ == info->u.nth.every) {
-			info->master->count = 0;
+		do {
+			oval = atomic_read(&info->master->count);
+			nval = (oval == info->u.nth.every) ? 0 : oval + 1;
+		} while (atomic_cmpxchg(&info->master->count, oval, nval) != oval);
+		if (nval == 0)
 			ret = !ret;
-		}
-		spin_unlock_bh(&nth_lock);
 		break;
 	}
 
@@ -64,7 +63,7 @@ static int statistic_mt_check(const struct xt_mtchk_param *par)
 	info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
 	if (info->master == NULL)
 		return -ENOMEM;
-	info->master->count = info->u.nth.count;
+	atomic_set(&info->master->count, info->u.nth.count);
 
 	return 0;
 }
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a2eb965207d..478181d53c5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -83,9 +83,9 @@ struct netlink_sock {
 	struct module		*module;
 };
 
-struct listeners_rcu_head {
-	struct rcu_head rcu_head;
-	void *ptr;
+struct listeners {
+	struct rcu_head		rcu;
+	unsigned long		masks[0];
 };
 
 #define NETLINK_KERNEL_SOCKET	0x1
@@ -119,7 +119,7 @@ struct nl_pid_hash {
 struct netlink_table {
 	struct nl_pid_hash hash;
 	struct hlist_head mc_list;
-	unsigned long *listeners;
+	struct listeners __rcu *listeners;
 	unsigned int nl_nonroot;
 	unsigned int groups;
 	struct mutex *cb_mutex;
@@ -338,7 +338,7 @@ netlink_update_listeners(struct sock *sk)
 			if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
 				mask |= nlk_sk(sk)->groups[i];
 		}
-		tbl->listeners[i] = mask;
+		tbl->listeners->masks[i] = mask;
 	}
 	/* this function is only called with the netlink table "grabbed", which
 	 * makes sure updates are visible before bind or setsockopt return. */
@@ -936,7 +936,7 @@ EXPORT_SYMBOL(netlink_unicast);
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
 	int res = 0;
-	unsigned long *listeners;
+	struct listeners *listeners;
 
 	BUG_ON(!netlink_is_kernel(sk));
 
@@ -944,7 +944,7 @@ int netlink_has_listeners(struct sock *sk, unsigned int group)
 	listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
 
 	if (group - 1 < nl_table[sk->sk_protocol].groups)
-		res = test_bit(group - 1, listeners);
+		res = test_bit(group - 1, listeners->masks);
 
 	rcu_read_unlock();
 
@@ -1076,14 +1076,15 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 pid,
 	sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
 		do_one_broadcast(sk, &info);
 
-	kfree_skb(skb);
+	consume_skb(skb);
 
 	netlink_unlock_table();
 
-	kfree_skb(info.skb2);
-
-	if (info.delivery_failure)
+	if (info.delivery_failure) {
+		kfree_skb(info.skb2);
 		return -ENOBUFS;
+	} else
+		consume_skb(info.skb2);
 
 	if (info.delivered) {
 		if (info.congested && (allocation & __GFP_WAIT))
@@ -1323,19 +1324,23 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (msg->msg_flags&MSG_OOB)
 		return -EOPNOTSUPP;
 
-	if (NULL == siocb->scm)
+	if (NULL == siocb->scm) {
 		siocb->scm = &scm;
+		memset(&scm, 0, sizeof(scm));
+	}
 	err = scm_send(sock, msg, siocb->scm);
 	if (err < 0)
 		return err;
 
 	if (msg->msg_namelen) {
+		err = -EINVAL;
 		if (addr->nl_family != AF_NETLINK)
-			return -EINVAL;
+			goto out;
 		dst_pid = addr->nl_pid;
 		dst_group = ffs(addr->nl_groups);
+		err =  -EPERM;
 		if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
-			return -EPERM;
+			goto out;
 	} else {
 		dst_pid = nlk->dst_pid;
 		dst_group = nlk->dst_group;
@@ -1387,6 +1392,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
 
 out:
+	scm_destroy(siocb->scm);
 	return err;
 }
 
@@ -1400,7 +1406,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	struct netlink_sock *nlk = nlk_sk(sk);
 	int noblock = flags&MSG_DONTWAIT;
 	size_t copied;
-	struct sk_buff *skb, *frag __maybe_unused = NULL;
+	struct sk_buff *skb, *data_skb;
 	int err;
 
 	if (flags&MSG_OOB)
@@ -1412,45 +1418,35 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
+	data_skb = skb;
+
 #ifdef CONFIG_COMPAT_NETLINK_MESSAGES
 	if (unlikely(skb_shinfo(skb)->frag_list)) {
-		bool need_compat = !!(flags & MSG_CMSG_COMPAT);
-
 		/*
-		 * If this skb has a frag_list, then here that means that
-		 * we will have to use the frag_list skb for compat tasks
-		 * and the regular skb for non-compat tasks.
+		 * If this skb has a frag_list, then here that means that we
+		 * will have to use the frag_list skb's data for compat tasks
+		 * and the regular skb's data for normal (non-compat) tasks.
 		 *
-		 * The skb might (and likely will) be cloned, so we can't
-		 * just reset frag_list and go on with things -- we need to
-		 * keep that. For the compat case that's easy -- simply get
-		 * a reference to the compat skb and free the regular one
-		 * including the frag. For the non-compat case, we need to
-		 * avoid sending the frag to the user -- so assign NULL but
-		 * restore it below before freeing the skb.
+		 * If we need to send the compat skb, assign it to the
+		 * 'data_skb' variable so that it will be used below for data
+		 * copying. We keep 'skb' for everything else, including
+		 * freeing both later.
 		 */
-		if (need_compat) {
-			struct sk_buff *compskb = skb_shinfo(skb)->frag_list;
-			skb_get(compskb);
-			kfree_skb(skb);
-			skb = compskb;
-		} else {
-			frag = skb_shinfo(skb)->frag_list;
-			skb_shinfo(skb)->frag_list = NULL;
-		}
+		if (flags & MSG_CMSG_COMPAT)
+			data_skb = skb_shinfo(skb)->frag_list;
 	}
 #endif
 
 	msg->msg_namelen = 0;
 
-	copied = skb->len;
+	copied = data_skb->len;
 	if (len < copied) {
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
 
-	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_reset_transport_header(data_skb);
+	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
 
 	if (msg->msg_name) {
 		struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
@@ -1470,11 +1466,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 	siocb->scm->creds = *NETLINK_CREDS(skb);
 	if (flags & MSG_TRUNC)
-		copied = skb->len;
-
-#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
-	skb_shinfo(skb)->frag_list = frag;
-#endif
+		copied = data_skb->len;
 
 	skb_free_datagram(sk, skb);
 
@@ -1506,7 +1498,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
 	struct socket *sock;
 	struct sock *sk;
 	struct netlink_sock *nlk;
-	unsigned long *listeners = NULL;
+	struct listeners *listeners = NULL;
 
 	BUG_ON(!nl_table);
 
@@ -1531,8 +1523,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
 	if (groups < 32)
 		groups = 32;
 
-	listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head),
-			    GFP_KERNEL);
+	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
 	if (!listeners)
 		goto out_sock_release;
 
@@ -1549,7 +1540,7 @@ netlink_kernel_create(struct net *net, int unit, unsigned int groups,
 	netlink_table_grab();
 	if (!nl_table[unit].registered) {
 		nl_table[unit].groups = groups;
-		nl_table[unit].listeners = listeners;
+		rcu_assign_pointer(nl_table[unit].listeners, listeners);
 		nl_table[unit].cb_mutex = cb_mutex;
 		nl_table[unit].module = module;
 		nl_table[unit].registered = 1;
@@ -1580,43 +1571,28 @@ netlink_kernel_release(struct sock *sk)
 EXPORT_SYMBOL(netlink_kernel_release);
 
 
-static void netlink_free_old_listeners(struct rcu_head *rcu_head)
+static void listeners_free_rcu(struct rcu_head *head)
 {
-	struct listeners_rcu_head *lrh;
-
-	lrh = container_of(rcu_head, struct listeners_rcu_head, rcu_head);
-	kfree(lrh->ptr);
+	kfree(container_of(head, struct listeners, rcu));
 }
 
 int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
 {
-	unsigned long *listeners, *old = NULL;
-	struct listeners_rcu_head *old_rcu_head;
+	struct listeners *new, *old;
 	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
 
 	if (groups < 32)
 		groups = 32;
 
 	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
-		listeners = kzalloc(NLGRPSZ(groups) +
-				    sizeof(struct listeners_rcu_head),
-				    GFP_ATOMIC);
-		if (!listeners)
+		new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
+		if (!new)
 			return -ENOMEM;
-		old = tbl->listeners;
-		memcpy(listeners, old, NLGRPSZ(tbl->groups));
-		rcu_assign_pointer(tbl->listeners, listeners);
-		/*
-		 * Free the old memory after an RCU grace period so we
-		 * don't leak it. We use call_rcu() here in order to be
-		 * able to call this function from atomic contexts. The
-		 * allocation of this memory will have reserved enough
-		 * space for struct listeners_rcu_head at the end.
-		 */
-		old_rcu_head = (void *)(tbl->listeners +
-					NLGRPLONGS(tbl->groups));
-		old_rcu_head->ptr = old;
-		call_rcu(&old_rcu_head->rcu_head, netlink_free_old_listeners);
+		old = rcu_dereference_raw(tbl->listeners);
+		memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
+		rcu_assign_pointer(tbl->listeners, new);
+
+		call_rcu(&old->rcu, listeners_free_rcu);
 	}
 	tbl->groups = groups;
 
@@ -2110,6 +2086,25 @@ static void __net_exit netlink_net_exit(struct net *net)
 #endif
 }
 
+static void __init netlink_add_usersock_entry(void)
+{
+	struct listeners *listeners;
+	int groups = 32;
+
+	listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
+	if (!listeners)
+		panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
+
+	netlink_table_grab();
+
+	nl_table[NETLINK_USERSOCK].groups = groups;
+	rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
+	nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
+	nl_table[NETLINK_USERSOCK].registered = 1;
+
+	netlink_table_ungrab();
+}
+
 static struct pernet_operations __net_initdata netlink_net_ops = {
 	.init = netlink_net_init,
 	.exit = netlink_net_exit,
@@ -2158,6 +2153,8 @@ static int __init netlink_proto_init(void)
 		hash->rehash_time = jiffies;
 	}
 
+	netlink_add_usersock_entry();
+
 	sock_register(&netlink_family_ops);
 	register_pernet_subsys(&netlink_net_ops);
 	/* The netlink device handler may be needed early. */
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index aa4308afcc7..1781d99145e 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -303,6 +303,7 @@ int genl_register_ops(struct genl_family *family, struct genl_ops *ops)
 errout:
 	return err;
 }
+EXPORT_SYMBOL(genl_register_ops);
 
 /**
  * genl_unregister_ops - unregister generic netlink operations
@@ -337,6 +338,7 @@ int genl_unregister_ops(struct genl_family *family, struct genl_ops *ops)
 
 	return -ENOENT;
 }
+EXPORT_SYMBOL(genl_unregister_ops);
 
 /**
  * genl_register_family - register a generic netlink family
@@ -405,6 +407,7 @@ errout_locked:
 errout:
 	return err;
 }
+EXPORT_SYMBOL(genl_register_family);
 
 /**
  * genl_register_family_with_ops - register a generic netlink family
@@ -485,6 +488,7 @@ int genl_unregister_family(struct genl_family *family)
 
 	return -ENOENT;
 }
+EXPORT_SYMBOL(genl_unregister_family);
 
 static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
@@ -543,8 +547,20 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
 	info.attrs = family->attrbuf;
 	genl_info_net_set(&info, net);
+	memset(&info.user_ptr, 0, sizeof(info.user_ptr));
+
+	if (family->pre_doit) {
+		err = family->pre_doit(ops, skb, &info);
+		if (err)
+			return err;
+	}
+
+	err = ops->doit(skb, &info);
+
+	if (family->post_doit)
+		family->post_doit(ops, skb, &info);
 
-	return ops->doit(skb, &info);
+	return err;
 }
 
 static void genl_rcv(struct sk_buff *skb)
@@ -873,11 +889,7 @@ static int __init genl_init(void)
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
 		INIT_LIST_HEAD(&family_ht[i]);
 
-	err = genl_register_family(&genl_ctrl);
-	if (err < 0)
-		goto problem;
-
-	err = genl_register_ops(&genl_ctrl, &genl_ctrl_ops);
+	err = genl_register_family_with_ops(&genl_ctrl, &genl_ctrl_ops, 1);
 	if (err < 0)
 		goto problem;
 
@@ -899,11 +911,6 @@ problem:
 
 subsys_initcall(genl_init);
 
-EXPORT_SYMBOL(genl_register_ops);
-EXPORT_SYMBOL(genl_unregister_ops);
-EXPORT_SYMBOL(genl_register_family);
-EXPORT_SYMBOL(genl_unregister_family);
-
 static int genlmsg_mcast(struct sk_buff *skb, u32 pid, unsigned long group,
 			 gfp_t flags)
 {
diff --git a/net/nonet.c b/net/nonet.c
index 92e76640c7c..b1a73fda9c1 100644
--- a/net/nonet.c
+++ b/net/nonet.c
@@ -22,4 +22,5 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 const struct file_operations bad_sock_fops = {
 	.owner = THIS_MODULE,
 	.open = sock_no_open,
+	.llseek = noop_llseek,
 };
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2078a277e06..3616f27b9d4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -83,6 +83,7 @@
 #include <linux/if_vlan.h>
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
+#include <linux/net_tstamp.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -202,6 +203,7 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_loss:1;
+	unsigned int		tp_tstamp;
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
 };
 
@@ -486,7 +488,7 @@ retry:
 	skb->dev = dev;
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
-	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 	if (err < 0)
 		goto out_unlock;
 
@@ -656,6 +658,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct sk_buff *copy_skb = NULL;
 	struct timeval tv;
 	struct timespec ts;
+	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
 
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
@@ -737,7 +740,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h1->tp_snaplen = snaplen;
 		h.h1->tp_mac = macoff;
 		h.h1->tp_net = netoff;
-		if (skb->tstamp.tv64)
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			tv = ktime_to_timeval(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			tv = ktime_to_timeval(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
 			tv = ktime_to_timeval(skb->tstamp);
 		else
 			do_gettimeofday(&tv);
@@ -750,7 +759,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_snaplen = snaplen;
 		h.h2->tp_mac = macoff;
 		h.h2->tp_net = netoff;
-		if (skb->tstamp.tv64)
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
 			ts = ktime_to_timespec(skb->tstamp);
 		else
 			getnstimeofday(&ts);
@@ -1194,7 +1209,7 @@ static int packet_snd(struct socket *sock,
 	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
 	if (err)
 		goto out_free;
-	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 	if (err < 0)
 		goto out_free;
 
@@ -2027,6 +2042,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->has_vnet_hdr = !!val;
 		return 0;
 	}
+	case PACKET_TIMESTAMP:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->tp_tstamp = val;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -2119,6 +2146,12 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_loss;
 		data = &val;
 		break;
+	case PACKET_TIMESTAMP:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->tp_tstamp;
+		data = &val;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index 6ec7d55b176..0d9b8a220a7 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -14,3 +14,15 @@ config PHONET
 
 	  To compile this driver as a module, choose M here: the module
 	  will be called phonet. If unsure, say N.
+
+config PHONET_PIPECTRLR
+	bool "Phonet Pipe Controller (EXPERIMENTAL)"
+	depends on PHONET && EXPERIMENTAL
+	default N
+	help
+	  The Pipe Controller implementation in Phonet stack to support Pipe
+	  data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500
+	  platform.
+
+	  This option is incompatible with older Nokia modems.
+	  Say N here unless you really know what you are doing.
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 73aee7f2fcd..fd95beb72f5 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -251,6 +251,16 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb,
 	else if (phonet_address_lookup(net, daddr) == 0) {
 		dev = phonet_device_get(net);
 		skb->pkt_type = PACKET_LOOPBACK;
+	} else if (pn_sockaddr_get_object(target) == 0) {
+		/* Resource routing (small race until phonet_rcv()) */
+		struct sock *sk = pn_find_sock_by_res(net,
+							target->spn_resource);
+		if (sk)	{
+			sock_put(sk);
+			dev = phonet_device_get(net);
+			skb->pkt_type = PACKET_LOOPBACK;
+		} else
+			dev = phonet_route_output(net, daddr);
 	} else
 		dev = phonet_route_output(net, daddr);
 
@@ -383,6 +393,13 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto out;
 	}
 
+	/* resource routing */
+	if (pn_sockaddr_get_object(&sa) == 0) {
+		struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource);
+		if (sk)
+			return sk_receive_skb(sk, skb, 0);
+	}
+
 	/* check if we are the destination */
 	if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) {
 		/* Phonet packet input */
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 1bd38db4fe1..2f032381bd4 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -52,6 +52,19 @@ static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
 		answ = skb ? skb->len : 0;
 		release_sock(sk);
 		return put_user(answ, (int __user *)arg);
+
+	case SIOCPNADDRESOURCE:
+	case SIOCPNDELRESOURCE: {
+			u32 res;
+			if (get_user(res, (u32 __user *)arg))
+				return -EFAULT;
+			if (res >= 256)
+				return -EINVAL;
+			if (cmd == SIOCPNADDRESOURCE)
+				return pn_sock_bind_res(sk, res);
+			else
+				return pn_sock_unbind_res(sk, res);
+		}
 	}
 
 	return -ENOIOCTLCMD;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index b2a3ae6cad7..3e60f2e4e6c 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -109,6 +109,210 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb,
 }
 
 #define PAD 0x00
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+static u8 pipe_negotiate_fc(u8 *host_fc, u8 *remote_fc, int len)
+{
+	int i, j;
+	u8 base_fc, final_fc;
+
+	for (i = 0; i < len; i++) {
+		base_fc = host_fc[i];
+		for (j = 0; j < len; j++) {
+			if (remote_fc[j] == base_fc) {
+				final_fc = base_fc;
+				goto done;
+			}
+		}
+	}
+	return -EINVAL;
+
+done:
+	return final_fc;
+
+}
+
+static int pipe_get_flow_info(struct sock *sk, struct sk_buff *skb,
+		u8 *pref_rx_fc, u8 *req_tx_fc)
+{
+	struct pnpipehdr *hdr;
+	u8 n_sb;
+
+	if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
+		return -EINVAL;
+
+	hdr = pnp_hdr(skb);
+	n_sb = hdr->data[4];
+
+	__skb_pull(skb, sizeof(*hdr) + 4);
+	while (n_sb > 0) {
+		u8 type, buf[3], len = sizeof(buf);
+		u8 *data = pep_get_sb(skb, &type, &len, buf);
+
+		if (data == NULL)
+			return -EINVAL;
+
+		switch (type) {
+		case PN_PIPE_SB_REQUIRED_FC_TX:
+			if (len < 3 || (data[2] | data[3] | data[4]) > 3)
+				break;
+			req_tx_fc[0] = data[2];
+			req_tx_fc[1] = data[3];
+			req_tx_fc[2] = data[4];
+			break;
+
+		case PN_PIPE_SB_PREFERRED_FC_RX:
+			if (len < 3 || (data[2] | data[3] | data[4]) > 3)
+				break;
+			pref_rx_fc[0] = data[2];
+			pref_rx_fc[1] = data[3];
+			pref_rx_fc[2] = data[4];
+			break;
+
+		}
+		n_sb--;
+	}
+	return 0;
+}
+
+static int pipe_handler_send_req(struct sock *sk, u8 utid,
+		u8 msg_id, gfp_t priority)
+{
+	int len;
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+	struct pep_sock *pn = pep_sk(sk);
+
+	static const u8 data[4] = {
+		PAD, PAD, PAD, PAD,
+	};
+
+	switch (msg_id) {
+	case PNS_PEP_CONNECT_REQ:
+		len = sizeof(data);
+		break;
+
+	case PNS_PEP_DISCONNECT_REQ:
+	case PNS_PEP_ENABLE_REQ:
+	case PNS_PEP_DISABLE_REQ:
+		len = 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority);
+	if (!skb)
+		return -ENOMEM;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER);
+	if (len) {
+		__skb_put(skb, len);
+		skb_copy_to_linear_data(skb, data, len);
+	}
+	__skb_push(skb, sizeof(*ph));
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = utid;
+	ph->message_id = msg_id;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->error_code = PN_PIPE_NO_ERROR;
+
+	return pn_skb_send(sk, skb, &pn->remote_pep);
+}
+
+static int pipe_handler_send_created_ind(struct sock *sk,
+		u8 utid, u8 msg_id)
+{
+	int err_code;
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	struct pep_sock *pn = pep_sk(sk);
+	static u8 data[4] = {
+		0x03, 0x04,
+	};
+	data[2] = pn->tx_fc;
+	data[3] = pn->rx_fc;
+
+	/*
+	 * actually, below is number of sub-blocks and not error code.
+	 * Pipe_created_ind message format does not have any
+	 * error code field. However, the Phonet stack will always send
+	 * an error code as part of pnpipehdr. So, use that err_code to
+	 * specify the number of sub-blocks.
+	 */
+	err_code = 0x01;
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER + sizeof(data), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER);
+	__skb_put(skb, sizeof(data));
+	skb_copy_to_linear_data(skb, data, sizeof(data));
+	__skb_push(skb, sizeof(*ph));
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = utid;
+	ph->message_id = msg_id;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->error_code = err_code;
+
+	return pn_skb_send(sk, skb, &pn->remote_pep);
+}
+
+static int pipe_handler_send_ind(struct sock *sk, u8 utid, u8 msg_id)
+{
+	int err_code;
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+	struct pep_sock *pn = pep_sk(sk);
+
+	/*
+	 * actually, below is a filler.
+	 * Pipe_enabled/disabled_ind message format does not have any
+	 * error code field. However, the Phonet stack will always send
+	 * an error code as part of pnpipehdr. So, use that err_code to
+	 * specify the filler value.
+	 */
+	err_code = 0x0;
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+	skb_set_owner_w(skb, sk);
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER);
+	__skb_push(skb, sizeof(*ph));
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = utid;
+	ph->message_id = msg_id;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->error_code = err_code;
+
+	return pn_skb_send(sk, skb, &pn->remote_pep);
+}
+
+static int pipe_handler_enable_pipe(struct sock *sk, int enable)
+{
+	int utid, req;
+
+	if (enable) {
+		utid = PNS_PIPE_ENABLE_UTID;
+		req = PNS_PEP_ENABLE_REQ;
+	} else {
+		utid = PNS_PIPE_DISABLE_UTID;
+		req = PNS_PEP_DISABLE_REQ;
+	}
+	return pipe_handler_send_req(sk, utid, req, GFP_ATOMIC);
+}
+#endif
+
 static int pep_accept_conn(struct sock *sk, struct sk_buff *skb)
 {
 	static const u8 data[20] = {
@@ -192,7 +396,11 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority)
 	ph->data[3] = PAD;
 	ph->data[4] = status;
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+	return pn_skb_send(sk, skb, &pn->remote_pep);
+#else
 	return pn_skb_send(sk, skb, &pipe_srv);
+#endif
 }
 
 /* Send our RX flow control information to the sender.
@@ -225,12 +433,13 @@ static void pipe_grant_credits(struct sock *sk)
 static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb)
 {
 	struct pep_sock *pn = pep_sk(sk);
-	struct pnpipehdr *hdr = pnp_hdr(skb);
+	struct pnpipehdr *hdr;
 	int wake = 0;
 
 	if (!pskb_may_pull(skb, sizeof(*hdr) + 4))
 		return -EINVAL;
 
+	hdr = pnp_hdr(skb);
 	if (hdr->data[0] != PN_PEP_TYPE_COMMON) {
 		LIMIT_NETDEBUG(KERN_DEBUG"Phonet unknown PEP type: %u\n",
 				(unsigned)hdr->data[0]);
@@ -323,11 +532,35 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
 			sk->sk_state_change(sk);
 		break;
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNS_PEP_DISCONNECT_RESP:
+		pn->pipe_state = PIPE_IDLE;
+		sk->sk_state = TCP_CLOSE;
+		break;
+#endif
+
 	case PNS_PEP_ENABLE_REQ:
 		/* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */
 		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
 		break;
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNS_PEP_ENABLE_RESP:
+		pn->pipe_state = PIPE_ENABLED;
+		pipe_handler_send_ind(sk, PNS_PIPE_ENABLED_IND_UTID,
+				PNS_PIPE_ENABLED_IND);
+
+		if (!pn_flow_safe(pn->tx_fc)) {
+			atomic_set(&pn->tx_credits, 1);
+			sk->sk_write_space(sk);
+		}
+		if (sk->sk_state == TCP_ESTABLISHED)
+			break; /* Nothing to do */
+		sk->sk_state = TCP_ESTABLISHED;
+		pipe_grant_credits(sk);
+		break;
+#endif
+
 	case PNS_PEP_RESET_REQ:
 		switch (hdr->state_after_reset) {
 		case PN_PIPE_DISABLE:
@@ -346,6 +579,17 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
 		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
 		break;
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNS_PEP_DISABLE_RESP:
+		pn->pipe_state = PIPE_DISABLED;
+		atomic_set(&pn->tx_credits, 0);
+		pipe_handler_send_ind(sk, PNS_PIPE_DISABLED_IND_UTID,
+				PNS_PIPE_DISABLED_IND);
+		sk->sk_state = TCP_SYN_RECV;
+		pn->rx_credits = 0;
+		break;
+#endif
+
 	case PNS_PEP_CTRL_REQ:
 		if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
 			atomic_inc(&sk->sk_drops);
@@ -437,6 +681,42 @@ static void pipe_destruct(struct sock *sk)
 	skb_queue_purge(&pn->ctrlreq_queue);
 }
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	u8 host_pref_rx_fc[3] = {3, 2, 1}, host_req_tx_fc[3] = {3, 2, 1};
+	u8 remote_pref_rx_fc[3], remote_req_tx_fc[3];
+	u8 negotiated_rx_fc, negotiated_tx_fc;
+	int ret;
+
+	pipe_get_flow_info(sk, skb, remote_pref_rx_fc,
+			remote_req_tx_fc);
+	negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc,
+			host_pref_rx_fc,
+			sizeof(host_pref_rx_fc));
+	negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc,
+			remote_pref_rx_fc,
+			sizeof(host_pref_rx_fc));
+
+	pn->pipe_state = PIPE_DISABLED;
+	sk->sk_state = TCP_SYN_RECV;
+	sk->sk_backlog_rcv = pipe_do_rcv;
+	sk->sk_destruct = pipe_destruct;
+	pn->rx_credits = 0;
+	pn->rx_fc = negotiated_rx_fc;
+	pn->tx_fc = negotiated_tx_fc;
+	sk->sk_state_change(sk);
+
+	ret = pipe_handler_send_created_ind(sk,
+			PNS_PIPE_CREATED_IND_UTID,
+			PNS_PIPE_CREATED_IND
+			);
+
+	return ret;
+}
+#endif
+
 static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct sock *newsk;
@@ -600,6 +880,12 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb)
 		err = pep_connreq_rcv(sk, skb);
 		break;
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNS_PEP_CONNECT_RESP:
+		err = pep_connresp_rcv(sk, skb);
+		break;
+#endif
+
 	case PNS_PEP_DISCONNECT_REQ:
 		pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC);
 		break;
@@ -620,6 +906,28 @@ drop:
 	return err;
 }
 
+static int pipe_do_remove(struct sock *sk)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct pnpipehdr *ph;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reserve(skb, MAX_PNPIPE_HEADER);
+	__skb_push(skb, sizeof(*ph));
+	skb_reset_transport_header(skb);
+	ph = pnp_hdr(skb);
+	ph->utid = 0;
+	ph->message_id = PNS_PIPE_REMOVE_REQ;
+	ph->pipe_handle = pn->pipe_handle;
+	ph->data[0] = PAD;
+
+	return pn_skb_send(sk, skb, &pipe_srv);
+}
+
 /* associated socket ceases to exist */
 static void pep_sock_close(struct sock *sk, long timeout)
 {
@@ -638,7 +946,22 @@ static void pep_sock_close(struct sock *sk, long timeout)
 		sk_for_each_safe(sknode, p, n, &pn->ackq)
 			sk_del_node_init(sknode);
 		sk->sk_state = TCP_CLOSE;
+	} else if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED))
+		/* Forcefully remove dangling Phonet pipe */
+		pipe_do_remove(sk);
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+	if (pn->pipe_state != PIPE_IDLE) {
+		/* send pep disconnect request */
+		pipe_handler_send_req(sk,
+				PNS_PEP_DISCONNECT_UTID, PNS_PEP_DISCONNECT_REQ,
+				GFP_KERNEL);
+
+		pn->pipe_state = PIPE_IDLE;
+		sk->sk_state = TCP_CLOSE;
 	}
+#endif
+
 	ifindex = pn->ifindex;
 	pn->ifindex = 0;
 	release_sock(sk);
@@ -715,6 +1038,20 @@ out:
 	return newsk;
 }
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len)
+{
+	struct pep_sock *pn = pep_sk(sk);
+	struct sockaddr_pn *spn =  (struct sockaddr_pn *)addr;
+
+	memcpy(&pn->remote_pep, spn, sizeof(struct sockaddr_pn));
+
+	return pipe_handler_send_req(sk,
+			PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ,
+			GFP_ATOMIC);
+}
+#endif
+
 static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
 	struct pep_sock *pn = pep_sk(sk);
@@ -766,6 +1103,18 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 
 	lock_sock(sk);
 	switch (optname) {
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNPIPE_PIPE_HANDLE:
+		if (val) {
+			if (pn->pipe_state > PIPE_IDLE) {
+				err = -EFAULT;
+				break;
+			}
+			pn->pipe_handle = val;
+			break;
+		}
+#endif
+
 	case PNPIPE_ENCAP:
 		if (val && val != PNPIPE_ENCAP_IP) {
 			err = -EINVAL;
@@ -791,6 +1140,17 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 			err = 0;
 		}
 		goto out_norel;
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNPIPE_ENABLE:
+		if (pn->pipe_state <= PIPE_IDLE) {
+			err = -ENOTCONN;
+			break;
+		}
+		err = pipe_handler_enable_pipe(sk, val);
+		break;
+#endif
+
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -815,9 +1175,19 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
 	case PNPIPE_ENCAP:
 		val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
 		break;
+
 	case PNPIPE_IFINDEX:
 		val = pn->ifindex;
 		break;
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNPIPE_ENABLE:
+		if (pn->pipe_state <= PIPE_IDLE)
+			return -ENOTCONN;
+		val = pn->pipe_state != PIPE_DISABLED;
+		break;
+#endif
+
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -834,6 +1204,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
 {
 	struct pep_sock *pn = pep_sk(sk);
 	struct pnpipehdr *ph;
+	int err;
 
 	if (pn_flow_safe(pn->tx_fc) &&
 	    !atomic_add_unless(&pn->tx_credits, -1, 0)) {
@@ -851,8 +1222,16 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb)
 	} else
 		ph->message_id = PNS_PIPE_DATA;
 	ph->pipe_handle = pn->pipe_handle;
+#ifdef CONFIG_PHONET_PIPECTRLR
+	err = pn_skb_send(sk, skb, &pn->remote_pep);
+#else
+	err = pn_skb_send(sk, skb, &pipe_srv);
+#endif
+
+	if (err && pn_flow_safe(pn->tx_fc))
+		atomic_inc(&pn->tx_credits);
+	return err;
 
-	return pn_skb_send(sk, skb, &pipe_srv);
 }
 
 static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
@@ -872,7 +1251,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk,
 	skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len,
 					flags & MSG_DONTWAIT, &err);
 	if (!skb)
-		return -ENOBUFS;
+		return err;
 
 	skb_reserve(skb, MAX_PHONET_HEADER + 3);
 	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
@@ -1044,6 +1423,8 @@ static void pep_sock_unhash(struct sock *sk)
 	struct sock *skparent = NULL;
 
 	lock_sock(sk);
+
+#ifndef CONFIG_PHONET_PIPECTRLR
 	if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) {
 		skparent = pn->listener;
 		release_sock(sk);
@@ -1053,6 +1434,7 @@ static void pep_sock_unhash(struct sock *sk)
 		sk_del_node_init(sk);
 		sk = skparent;
 	}
+#endif
 	/* Unhash a listening sock only when it is closed
 	 * and all of its active connected pipes are closed. */
 	if (hlist_empty(&pn->hlist))
@@ -1066,6 +1448,9 @@ static void pep_sock_unhash(struct sock *sk)
 static struct proto pep_proto = {
 	.close		= pep_sock_close,
 	.accept		= pep_sock_accept,
+#ifdef CONFIG_PHONET_PIPECTRLR
+	.connect	= pep_sock_connect,
+#endif
 	.ioctl		= pep_ioctl,
 	.init		= pep_init,
 	.setsockopt	= pep_setsockopt,
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index c33da657694..947038ddd04 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -162,6 +162,14 @@ int phonet_address_add(struct net_device *dev, u8 addr)
 	return err;
 }
 
+static void phonet_device_rcu_free(struct rcu_head *head)
+{
+	struct phonet_device *pnd;
+
+	pnd = container_of(head, struct phonet_device, rcu);
+	kfree(pnd);
+}
+
 int phonet_address_del(struct net_device *dev, u8 addr)
 {
 	struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev));
@@ -179,10 +187,9 @@ int phonet_address_del(struct net_device *dev, u8 addr)
 		pnd = NULL;
 	mutex_unlock(&pndevs->lock);
 
-	if (pnd) {
-		synchronize_rcu();
-		kfree(pnd);
-	}
+	if (pnd)
+		call_rcu(&pnd->rcu, phonet_device_rcu_free);
+
 	return err;
 }
 
@@ -285,8 +292,7 @@ static void phonet_route_autodel(struct net_device *dev)
 	if (bitmap_empty(deleted, 64))
 		return; /* short-circuit RCU */
 	synchronize_rcu();
-	for (i = find_first_bit(deleted, 64); i < 64;
-			i = find_next_bit(deleted, 64, i + 1)) {
+	for_each_set_bit(i, deleted, 64) {
 		rtm_phonet_notify(RTM_DELROUTE, dev, i);
 		dev_put(dev);
 	}
@@ -367,6 +373,7 @@ int __init phonet_device_init(void)
 	if (err)
 		return err;
 
+	proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops);
 	register_netdevice_notifier(&phonet_device_notifier);
 	err = phonet_netlink_register();
 	if (err)
@@ -379,6 +386,7 @@ void phonet_device_exit(void)
 	rtnl_unregister_all(PF_PHONET);
 	unregister_netdevice_notifier(&phonet_device_notifier);
 	unregister_pernet_device(&phonet_net_ops);
+	proc_net_remove(&init_net, "pnresource");
 }
 
 int phonet_route_add(struct net_device *dev, u8 daddr)
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 6e9848bf037..25f746d20c1 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -158,6 +158,7 @@ void pn_sock_unhash(struct sock *sk)
 	spin_lock_bh(&pnsocks.lock);
 	sk_del_node_init(sk);
 	spin_unlock_bh(&pnsocks.lock);
+	pn_sock_unbind_all_res(sk);
 }
 EXPORT_SYMBOL(pn_sock_unhash);
 
@@ -224,6 +225,101 @@ static int pn_socket_autobind(struct socket *sock)
 	return 0; /* socket was already bound */
 }
 
+#ifdef CONFIG_PHONET_PIPECTRLR
+static int pn_socket_connect(struct socket *sock, struct sockaddr *addr,
+		int len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_pn *spn = (struct sockaddr_pn *)addr;
+	long timeo;
+	int err;
+
+	if (len < sizeof(struct sockaddr_pn))
+		return -EINVAL;
+	if (spn->spn_family != AF_PHONET)
+		return -EAFNOSUPPORT;
+
+	lock_sock(sk);
+
+	switch (sock->state) {
+	case SS_UNCONNECTED:
+		sk->sk_state = TCP_CLOSE;
+		break;
+	case SS_CONNECTING:
+		switch (sk->sk_state) {
+		case TCP_SYN_RECV:
+			sock->state = SS_CONNECTED;
+			err = -EISCONN;
+			goto out;
+		case TCP_CLOSE:
+			err = -EALREADY;
+			if (flags & O_NONBLOCK)
+				goto out;
+			goto wait_connect;
+		}
+		break;
+	case SS_CONNECTED:
+		switch (sk->sk_state) {
+		case TCP_SYN_RECV:
+			err = -EISCONN;
+			goto out;
+		case TCP_CLOSE:
+			sock->state = SS_UNCONNECTED;
+			break;
+		}
+		break;
+	case SS_DISCONNECTING:
+	case SS_FREE:
+		break;
+	}
+	sk->sk_state = TCP_CLOSE;
+	sk_stream_kill_queues(sk);
+
+	sock->state = SS_CONNECTING;
+	err = sk->sk_prot->connect(sk, addr, len);
+	if (err < 0) {
+		sock->state = SS_UNCONNECTED;
+		sk->sk_state = TCP_CLOSE;
+		goto out;
+	}
+
+	err = -EINPROGRESS;
+wait_connect:
+	if (sk->sk_state != TCP_SYN_RECV && (flags & O_NONBLOCK))
+		goto out;
+
+	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+	release_sock(sk);
+
+	err = -ERESTARTSYS;
+	timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+			sk->sk_state != TCP_CLOSE,
+			timeo);
+
+	lock_sock(sk);
+	if (timeo < 0)
+		goto out; /* -ERESTARTSYS */
+
+	err = -ETIMEDOUT;
+	if (timeo == 0 && sk->sk_state != TCP_SYN_RECV)
+		goto out;
+
+	if (sk->sk_state != TCP_SYN_RECV) {
+		sock->state = SS_UNCONNECTED;
+		err = sock_error(sk);
+		if (!err)
+			err = -ECONNREFUSED;
+		goto out;
+	}
+	sock->state = SS_CONNECTED;
+	err = 0;
+
+out:
+	release_sock(sk);
+	return err;
+}
+#endif
+
 static int pn_socket_accept(struct socket *sock, struct socket *newsock,
 				int flags)
 {
@@ -281,7 +377,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock,
 	if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
 		return POLLHUP;
 
-	if (sk->sk_state == TCP_ESTABLISHED && atomic_read(&pn->tx_credits))
+	if (sk->sk_state == TCP_ESTABLISHED &&
+		atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf &&
+		atomic_read(&pn->tx_credits))
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 
 	return mask;
@@ -390,7 +488,11 @@ const struct proto_ops phonet_stream_ops = {
 	.owner		= THIS_MODULE,
 	.release	= pn_socket_release,
 	.bind		= pn_socket_bind,
+#ifdef CONFIG_PHONET_PIPECTRLR
+	.connect	= pn_socket_connect,
+#else
 	.connect	= sock_no_connect,
+#endif
 	.socketpair	= sock_no_socketpair,
 	.accept		= pn_socket_accept,
 	.getname	= pn_socket_getname,
@@ -563,3 +665,188 @@ const struct file_operations pn_sock_seq_fops = {
 	.release = seq_release_net,
 };
 #endif
+
+static struct  {
+	struct sock *sk[256];
+} pnres;
+
+/*
+ * Find and hold socket based on resource.
+ */
+struct sock *pn_find_sock_by_res(struct net *net, u8 res)
+{
+	struct sock *sk;
+
+	if (!net_eq(net, &init_net))
+		return NULL;
+
+	rcu_read_lock();
+	sk = rcu_dereference(pnres.sk[res]);
+	if (sk)
+		sock_hold(sk);
+	rcu_read_unlock();
+	return sk;
+}
+
+static DEFINE_MUTEX(resource_mutex);
+
+int pn_sock_bind_res(struct sock *sk, u8 res)
+{
+	int ret = -EADDRINUSE;
+
+	if (!net_eq(sock_net(sk), &init_net))
+		return -ENOIOCTLCMD;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (pn_socket_autobind(sk->sk_socket))
+		return -EAGAIN;
+
+	mutex_lock(&resource_mutex);
+	if (pnres.sk[res] == NULL) {
+		sock_hold(sk);
+		rcu_assign_pointer(pnres.sk[res], sk);
+		ret = 0;
+	}
+	mutex_unlock(&resource_mutex);
+	return ret;
+}
+
+int pn_sock_unbind_res(struct sock *sk, u8 res)
+{
+	int ret = -ENOENT;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&resource_mutex);
+	if (pnres.sk[res] == sk) {
+		rcu_assign_pointer(pnres.sk[res], NULL);
+		ret = 0;
+	}
+	mutex_unlock(&resource_mutex);
+
+	if (ret == 0) {
+		synchronize_rcu();
+		sock_put(sk);
+	}
+	return ret;
+}
+
+void pn_sock_unbind_all_res(struct sock *sk)
+{
+	unsigned res, match = 0;
+
+	mutex_lock(&resource_mutex);
+	for (res = 0; res < 256; res++) {
+		if (pnres.sk[res] == sk) {
+			rcu_assign_pointer(pnres.sk[res], NULL);
+			match++;
+		}
+	}
+	mutex_unlock(&resource_mutex);
+
+	if (match == 0)
+		return;
+	synchronize_rcu();
+	while (match > 0) {
+		sock_put(sk);
+		match--;
+	}
+}
+
+#ifdef CONFIG_PROC_FS
+static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct net *net = seq_file_net(seq);
+	unsigned i;
+
+	if (!net_eq(net, &init_net))
+		return NULL;
+
+	for (i = 0; i < 256; i++) {
+		if (pnres.sk[i] == NULL)
+			continue;
+		if (!pos)
+			return pnres.sk + i;
+		pos--;
+	}
+	return NULL;
+}
+
+static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
+{
+	struct net *net = seq_file_net(seq);
+	unsigned i;
+
+	BUG_ON(!net_eq(net, &init_net));
+
+	for (i = (sk - pnres.sk) + 1; i < 256; i++)
+		if (pnres.sk[i])
+			return pnres.sk + i;
+	return NULL;
+}
+
+static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(resource_mutex)
+{
+	mutex_lock(&resource_mutex);
+	return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock **sk;
+
+	if (v == SEQ_START_TOKEN)
+		sk = pn_res_get_idx(seq, 0);
+	else
+		sk = pn_res_get_next(seq, v);
+	(*pos)++;
+	return sk;
+}
+
+static void pn_res_seq_stop(struct seq_file *seq, void *v)
+	__releases(resource_mutex)
+{
+	mutex_unlock(&resource_mutex);
+}
+
+static int pn_res_seq_show(struct seq_file *seq, void *v)
+{
+	int len;
+
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%s%n", "rs   uid inode", &len);
+	else {
+		struct sock **psk = v;
+		struct sock *sk = *psk;
+
+		seq_printf(seq, "%02X %5d %lu%n",
+			   (int) (psk - pnres.sk), sock_i_uid(sk),
+			   sock_i_ino(sk), &len);
+	}
+	seq_printf(seq, "%*s\n", 63 - len, "");
+	return 0;
+}
+
+static const struct seq_operations pn_res_seq_ops = {
+	.start = pn_res_seq_start,
+	.next = pn_res_seq_next,
+	.stop = pn_res_seq_stop,
+	.show = pn_res_seq_show,
+};
+
+static int pn_res_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &pn_res_seq_ops,
+				sizeof(struct seq_net_private));
+}
+
+const struct file_operations pn_res_seq_fops = {
+	.owner = THIS_MODULE,
+	.open = pn_res_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_net,
+};
+#endif
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index aebfecbdb84..bb6ad81b671 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,15 @@
 #include <net/sock.h>
 
 #include "rds.h"
-#include "rdma.h"
+
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+	if ((index < elements) && array[index])
+		return array[index];
+	else
+		return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
 
 /* this is just used for stats gathering :/ */
 static DEFINE_SPINLOCK(rds_sock_lock);
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
 	struct rds_sock *rs;
 	unsigned long flags;
 
-	if (sk == NULL)
+	if (!sk)
 		goto out;
 
 	rs = rds_sk_to_rs(sk);
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
 	 * with the socket. */
 	rds_clear_recv_queue(rs);
 	rds_cong_remove_socket(rs);
+
+	/*
+	 * the binding lookup hash uses rcu, we need to
+	 * make sure we sychronize_rcu before we free our
+	 * entry
+	 */
 	rds_remove_bound(rs);
+	synchronize_rcu();
+
 	rds_send_drop_to(rs, NULL);
 	rds_rdma_drop_keys(rs);
 	rds_notify_queue_get(rs, NULL);
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
 	rds_sock_count--;
 	spin_unlock_irqrestore(&rds_sock_lock, flags);
 
+	rds_trans_put(rs->rs_transport);
+
 	sock->sk = NULL;
 	sock_put(sk);
 out:
@@ -514,7 +532,7 @@ out:
 	spin_unlock_irqrestore(&rds_sock_lock, flags);
 }
 
-static void __exit rds_exit(void)
+static void rds_exit(void)
 {
 	sock_unregister(rds_family_ops.family);
 	proto_unregister(&rds_proto);
@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
 }
 module_exit(rds_exit);
 
-static int __init rds_init(void)
+static int rds_init(void)
 {
 	int ret;
 
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5d95fc007f1..2f6b3fcc79f 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -34,45 +34,52 @@
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/if_arp.h>
+#include <linux/jhash.h>
 #include "rds.h"
 
-/*
- * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
- * particularly zippy.
- *
- * This is now called for every incoming frame so we arguably care much more
- * about it than we used to.
- */
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
 static DEFINE_SPINLOCK(rds_bind_lock);
-static struct rb_root rds_bind_tree = RB_ROOT;
 
-static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
-					   struct rds_sock *insert)
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+	return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+				  (BIND_HASH_SIZE - 1));
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+					struct rds_sock *insert)
 {
-	struct rb_node **p = &rds_bind_tree.rb_node;
-	struct rb_node *parent = NULL;
 	struct rds_sock *rs;
+	struct hlist_node *node;
+	struct hlist_head *head = hash_to_bucket(addr, port);
 	u64 cmp;
 	u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
 
-	while (*p) {
-		parent = *p;
-		rs = rb_entry(parent, struct rds_sock, rs_bound_node);
-
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
 		cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
 		      be16_to_cpu(rs->rs_bound_port);
 
-		if (needle < cmp)
-			p = &(*p)->rb_left;
-		else if (needle > cmp)
-			p = &(*p)->rb_right;
-		else
+		if (cmp == needle) {
+			rcu_read_unlock();
 			return rs;
+		}
 	}
+	rcu_read_unlock();
 
 	if (insert) {
-		rb_link_node(&insert->rs_bound_node, parent, p);
-		rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+		/*
+		 * make sure our addr and port are set before
+		 * we are added to the list, other people
+		 * in rcu will find us as soon as the
+		 * hlist_add_head_rcu is done
+		 */
+		insert->rs_bound_addr = addr;
+		insert->rs_bound_port = port;
+		rds_sock_addref(insert);
+
+		hlist_add_head_rcu(&insert->rs_bound_node, head);
 	}
 	return NULL;
 }
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
 struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
 {
 	struct rds_sock *rs;
-	unsigned long flags;
 
-	spin_lock_irqsave(&rds_bind_lock, flags);
-	rs = rds_bind_tree_walk(addr, port, NULL);
+	rs = rds_bind_lookup(addr, port, NULL);
+
 	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 		rds_sock_addref(rs);
 	else
 		rs = NULL;
-	spin_unlock_irqrestore(&rds_bind_lock, flags);
 
 	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
 		ntohs(port));
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 	do {
 		if (rover == 0)
 			rover++;
-		if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
-			*port = cpu_to_be16(rover);
+		if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+			*port = rs->rs_bound_port;
 			ret = 0;
+			rdsdebug("rs %p binding to %pI4:%d\n",
+			  rs, &addr, (int)ntohs(*port));
 			break;
 		}
 	} while (rover++ != last);
 
-	if (ret == 0)  {
-		rs->rs_bound_addr = addr;
-		rs->rs_bound_port = *port;
-		rds_sock_addref(rs);
-
-		rdsdebug("rs %p binding to %pI4:%d\n",
-		  rs, &addr, (int)ntohs(*port));
-	}
-
 	spin_unlock_irqrestore(&rds_bind_lock, flags);
 
 	return ret;
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
 		  rs, &rs->rs_bound_addr,
 		  ntohs(rs->rs_bound_port));
 
-		rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+		hlist_del_init_rcu(&rs->rs_bound_node);
 		rds_sock_put(rs);
 		rs->rs_bound_addr = 0;
 	}
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;
 
 	trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
-	if (trans == NULL) {
+	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
 		if (printk_ratelimit())
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 out:
 	release_sock(sk);
+
+	/* we might have called rds_remove_bound on error */
+	if (ret)
+		synchronize_rcu();
 	return ret;
 }
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 0871a29f078..75ea686f27d 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	unsigned long flags;
 
 	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
-	if (map == NULL)
+	if (!map)
 		return NULL;
 
 	map->m_addr = addr;
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	ret = rds_cong_tree_walk(addr, map);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);
 
-	if (ret == NULL) {
+	if (!ret) {
 		ret = map;
 		map = NULL;
 	}
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
 	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
 	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
 
-	if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+	if (!(conn->c_lcong && conn->c_fcong))
 		return -ENOMEM;
 
 	return 0;
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
 			rds_stats_inc(s_cong_update_queued);
-			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+			rds_send_xmit(conn);
 		}
 	}
 
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7619b671ca2..9334d892366 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -37,7 +37,6 @@
 
 #include "rds.h"
 #include "loop.h"
-#include "rdma.h"
 
 #define RDS_CONNECTION_HASH_BITS 12
 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 		var |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
 } while (0)
 
-static inline int rds_conn_is_sending(struct rds_connection *conn)
-{
-	int ret = 0;
-
-	if (!mutex_trylock(&conn->c_send_lock))
-		ret = 1;
-	else
-		mutex_unlock(&conn->c_send_lock);
-
-	return ret;
-}
-
+/* rcu read lock must be held or the connection spinlock */
 static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 					      __be32 laddr, __be32 faddr,
 					      struct rds_transport *trans)
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 	struct rds_connection *conn, *ret = NULL;
 	struct hlist_node *pos;
 
-	hlist_for_each_entry(conn, pos, head, c_hash_node) {
+	hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 		if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
 				conn->c_trans == trans) {
 			ret = conn;
@@ -100,7 +88,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
  * and receiving over this connection again in the future.  It is up to
  * the transport to have serialized this call with its send and recv.
  */
-void rds_conn_reset(struct rds_connection *conn)
+static void rds_conn_reset(struct rds_connection *conn)
 {
 	rdsdebug("connection %pI4 to %pI4 reset\n",
 	  &conn->c_laddr, &conn->c_faddr);
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 {
 	struct rds_connection *conn, *parent = NULL;
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+	struct rds_transport *loop_trans;
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();
 	conn = rds_conn_lookup(head, laddr, faddr, trans);
 	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
 	    !is_outgoing) {
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 		parent = conn;
 		conn = parent->c_passive;
 	}
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 	if (conn)
 		goto out;
 
 	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
-	if (conn == NULL) {
+	if (!conn) {
 		conn = ERR_PTR(-ENOMEM);
 		goto out;
 	}
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	spin_lock_init(&conn->c_lock);
 	conn->c_next_tx_seq = 1;
 
-	mutex_init(&conn->c_send_lock);
+	init_waitqueue_head(&conn->c_waitq);
 	INIT_LIST_HEAD(&conn->c_send_queue);
 	INIT_LIST_HEAD(&conn->c_retrans);
 
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	 * can bind to the destination address then we'd rather the messages
 	 * flow through loopback rather than either transport.
 	 */
-	if (rds_trans_get_preferred(faddr)) {
+	loop_trans = rds_trans_get_preferred(faddr);
+	if (loop_trans) {
+		rds_trans_put(loop_trans);
 		conn->c_loopback = 1;
 		if (is_outgoing && trans->t_prefer_loopback) {
 			/* "outgoing" connection - and the transport
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 			kmem_cache_free(rds_conn_slab, conn);
 			conn = found;
 		} else {
-			hlist_add_head(&conn->c_hash_node, head);
+			hlist_add_head_rcu(&conn->c_hash_node, head);
 			rds_cong_add_conn(conn);
 			rds_conn_count++;
 		}
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+	/* shut it down unless it's down already */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+		/*
+		 * Quiesce the connection mgmt handlers before we start tearing
+		 * things down. We don't hold the mutex for the entire
+		 * duration of the shutdown operation, else we may be
+		 * deadlocking with the CM handler. Instead, the CM event
+		 * handler is supposed to check for state DISCONNECTING
+		 */
+		mutex_lock(&conn->c_cm_lock);
+		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+		 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+			rds_conn_error(conn, "shutdown called in state %d\n",
+					atomic_read(&conn->c_state));
+			mutex_unlock(&conn->c_cm_lock);
+			return;
+		}
+		mutex_unlock(&conn->c_cm_lock);
+
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+		conn->c_trans->conn_shutdown(conn);
+		rds_conn_reset(conn);
+
+		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+			/* This can happen - eg when we're in the middle of tearing
+			 * down the connection, and someone unloads the rds module.
+			 * Quite reproduceable with loopback connections.
+			 * Mostly harmless.
+			 */
+			rds_conn_error(conn,
+				"%s: failed to transition to state DOWN, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+			return;
+		}
+	}
+
+	/* Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer. */
+	cancel_delayed_work_sync(&conn->c_conn_w);
+	rcu_read_lock();
+	if (!hlist_unhashed(&conn->c_hash_node)) {
+		rcu_read_unlock();
+		rds_queue_reconnect(conn);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances.  It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ */
 void rds_conn_destroy(struct rds_connection *conn)
 {
 	struct rds_message *rm, *rtmp;
+	unsigned long flags;
 
 	rdsdebug("freeing conn %p for %pI4 -> "
 		 "%pI4\n", conn, &conn->c_laddr,
 		 &conn->c_faddr);
 
-	hlist_del_init(&conn->c_hash_node);
+	/* Ensure conn will not be scheduled for reconnect */
+	spin_lock_irq(&rds_conn_lock);
+	hlist_del_init_rcu(&conn->c_hash_node);
+	spin_unlock_irq(&rds_conn_lock);
+	synchronize_rcu();
 
-	/* wait for the rds thread to shut it down */
-	atomic_set(&conn->c_state, RDS_CONN_ERROR);
-	cancel_delayed_work(&conn->c_conn_w);
-	queue_work(rds_wq, &conn->c_down_w);
-	flush_workqueue(rds_wq);
+	/* shut the connection down */
+	rds_conn_drop(conn);
+	flush_work(&conn->c_down_w);
+
+	/* make sure lingering queued work won't try to ref the conn */
+	cancel_delayed_work_sync(&conn->c_send_w);
+	cancel_delayed_work_sync(&conn->c_recv_w);
 
 	/* tear down queued messages */
 	list_for_each_entry_safe(rm, rtmp,
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
 	BUG_ON(!list_empty(&conn->c_retrans));
 	kmem_cache_free(rds_conn_slab, conn);
 
+	spin_lock_irqsave(&rds_conn_lock, flags);
 	rds_conn_count--;
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
 }
 EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 	struct list_head *list;
 	struct rds_connection *conn;
 	struct rds_message *rm;
-	unsigned long flags;
 	unsigned int total = 0;
+	unsigned long flags;
 	size_t i;
 
 	len /= sizeof(struct rds_info_message);
 
-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();
 
 	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
 	     i++, head++) {
-		hlist_for_each_entry(conn, pos, head, c_hash_node) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 			if (want_send)
 				list = &conn->c_send_queue;
 			else
 				list = &conn->c_retrans;
 
-			spin_lock(&conn->c_lock);
+			spin_lock_irqsave(&conn->c_lock, flags);
 
 			/* XXX too lazy to maintain counts.. */
 			list_for_each_entry(rm, list, m_conn_item) {
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 							  conn->c_faddr, 0);
 			}
 
-			spin_unlock(&conn->c_lock);
+			spin_unlock_irqrestore(&conn->c_lock, flags);
 		}
 	}
-
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 
 	lens->nr = total;
 	lens->each = sizeof(struct rds_info_message);
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 	uint64_t buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
 	struct hlist_node *pos;
-	struct hlist_node *tmp;
 	struct rds_connection *conn;
-	unsigned long flags;
 	size_t i;
 
-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();
 
 	lens->nr = 0;
 	lens->each = item_len;
 
 	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
 	     i++, head++) {
-		hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 
 			/* XXX no c_lock usage.. */
 			if (!visitor(conn, buffer))
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			lens->nr++;
 		}
 	}
-
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
 
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
 		sizeof(cinfo->transport));
 	cinfo->flags = 0;
 
-	rds_conn_info_set(cinfo->flags,
-			  rds_conn_is_sending(conn), SENDING);
+	rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+			  SENDING);
 	/* XXX Future: return the state rather than these funky bits */
 	rds_conn_info_set(cinfo->flags,
 			  atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_connection));
 }
 
-int __init rds_conn_init(void)
+int rds_conn_init(void)
 {
 	rds_conn_slab = kmem_cache_create("rds_connection",
 					  sizeof(struct rds_connection),
 					  0, 0, NULL);
-	if (rds_conn_slab == NULL)
+	if (!rds_conn_slab)
 		return -ENOMEM;
 
 	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn)
 EXPORT_SYMBOL_GPL(rds_conn_drop);
 
 /*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
+/*
  * An error occurred on the connection
  */
 void
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8f2d6dd7700..4123967d4d6 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,7 +42,7 @@
 #include "rds.h"
 #include "ib.h"
 
-unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
 unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
 
@@ -53,13 +53,72 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
 
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
 struct list_head rds_ib_devices;
 
 /* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);
 
-void rds_ib_add_one(struct ib_device *device)
+static void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+		rds_conn_connect_if_down(ic->conn);
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+		rds_conn_drop(ic->conn);
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_ipaddr *i_ipaddr, *i_next;
+	struct rds_ib_device *rds_ibdev = container_of(work,
+					struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+		list_del(&i_ipaddr->list);
+		kfree(i_ipaddr);
+	}
+
+	kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+	if (atomic_dec_and_test(&rds_ibdev->refcount))
+		queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
+static void rds_ib_add_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
 	struct ib_device_attr *dev_attr;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
 		goto free_attr;
 	}
 
-	rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+				 ibdev_to_node(device));
 	if (!rds_ibdev)
 		goto free_attr;
 
 	spin_lock_init(&rds_ibdev->spinlock);
+	atomic_set(&rds_ibdev->refcount, 1);
+	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
 
 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
 			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
 			fmr_pool_size;
 
+	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device);
-	if (IS_ERR(rds_ibdev->pd))
-		goto free_dev;
+	if (IS_ERR(rds_ibdev->pd)) {
+		rds_ibdev->pd = NULL;
+		goto put_dev;
+	}
 
-	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
-				      IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(rds_ibdev->mr))
-		goto err_pd;
+	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(rds_ibdev->mr)) {
+		rds_ibdev->mr = NULL;
+		goto put_dev;
+	}
 
 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
 	if (IS_ERR(rds_ibdev->mr_pool)) {
 		rds_ibdev->mr_pool = NULL;
-		goto err_mr;
+		goto put_dev;
 	}
 
 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
-	list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+	down_write(&rds_ib_devices_lock);
+	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+	up_write(&rds_ib_devices_lock);
+	atomic_inc(&rds_ibdev->refcount);
 
 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+	atomic_inc(&rds_ibdev->refcount);
 
-	goto free_attr;
+	rds_ib_nodev_connect();
 
-err_mr:
-	ib_dereg_mr(rds_ibdev->mr);
-err_pd:
-	ib_dealloc_pd(rds_ibdev->pd);
-free_dev:
-	kfree(rds_ibdev);
+put_dev:
+	rds_ib_dev_put(rds_ibdev);
 free_attr:
 	kfree(dev_attr);
 }
 
-void rds_ib_remove_one(struct ib_device *device)
+/*
+ * New connections use this to find the device to associate with the
+ * connection.  It's not in the fast path so we're not concerned about the
+ * performance of the IB call.  (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period.  The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if its arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
-	struct rds_ib_ipaddr *i_ipaddr, *i_next;
 
+	rcu_read_lock();
 	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
-	if (!rds_ibdev)
-		return;
+	if (rds_ibdev)
+		atomic_inc(&rds_ibdev->refcount);
+	rcu_read_unlock();
+	return rds_ibdev;
+}
 
-	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
-		list_del(&i_ipaddr->list);
-		kfree(i_ipaddr);
-	}
+/*
+ * The IB stack is letting us know that a device is going away.  This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
+static void rds_ib_remove_one(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
 
-	rds_ib_destroy_conns(rds_ibdev);
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (!rds_ibdev)
+		return;
 
-	if (rds_ibdev->mr_pool)
-		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	rds_ib_dev_shutdown(rds_ibdev);
 
-	ib_dereg_mr(rds_ibdev->mr);
+	/* stop connection attempts from getting a reference to this device. */
+	ib_set_client_data(device, &rds_ib_client, NULL);
 
-	while (ib_dealloc_pd(rds_ibdev->pd)) {
-		rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
-		msleep(1);
-	}
+	down_write(&rds_ib_devices_lock);
+	list_del_rcu(&rds_ibdev->list);
+	up_write(&rds_ib_devices_lock);
 
-	list_del(&rds_ibdev->list);
-	kfree(rds_ibdev);
+	/*
+	 * This synchronize rcu is waiting for readers of both the ib
+	 * client data and the devices list to finish before we drop
+	 * both of those references.
+	 */
+	synchronize_rcu();
+	rds_ib_dev_put(rds_ibdev);
+	rds_ib_dev_put(rds_ibdev);
 }
 
 struct ib_client rds_ib_client = {
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
 		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
 
-		rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+		rds_ibdev = ic->rds_ibdev;
 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 		iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
 	return ret;
 }
 
+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
-	ib_unregister_client(&rds_ib_client);
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
+	rds_ib_fmr_exit();
 }
 
 struct rds_transport rds_ib_transport = {
 	.laddr_check		= rds_ib_laddr_check,
 	.xmit_complete		= rds_ib_xmit_complete,
 	.xmit			= rds_ib_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_ib_xmit_rdma,
+	.xmit_atomic		= rds_ib_xmit_atomic,
 	.recv			= rds_ib_recv,
 	.conn_alloc		= rds_ib_conn_alloc,
 	.conn_free		= rds_ib_conn_free,
 	.conn_connect		= rds_ib_conn_connect,
 	.conn_shutdown		= rds_ib_conn_shutdown,
 	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
-	.inc_purge		= rds_ib_inc_purge,
 	.inc_free		= rds_ib_inc_free,
 	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
 	.cm_handle_connect	= rds_ib_cm_handle_connect,
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
 	.t_type			= RDS_TRANS_IB
 };
 
-int __init rds_ib_init(void)
+int rds_ib_init(void)
 {
 	int ret;
 
 	INIT_LIST_HEAD(&rds_ib_devices);
 
-	ret = ib_register_client(&rds_ib_client);
+	ret = rds_ib_fmr_init();
 	if (ret)
 		goto out;
 
+	ret = ib_register_client(&rds_ib_client);
+	if (ret)
+		goto out_fmr_exit;
+
 	ret = rds_ib_sysctl_init();
 	if (ret)
 		goto out_ibreg;
@@ -317,7 +429,9 @@ out_recv:
 out_sysctl:
 	rds_ib_sysctl_exit();
 out_ibreg:
-	ib_unregister_client(&rds_ib_client);
+	rds_ib_unregister_client();
+out_fmr_exit:
+	rds_ib_fmr_exit();
 out:
 	return ret;
 }
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 64df4e79b29..e34ad032b66 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -3,11 +3,13 @@
 
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
 #include "rds.h"
 #include "rdma_transport.h"
 
 #define RDS_FMR_SIZE			256
-#define RDS_FMR_POOL_SIZE		4096
+#define RDS_FMR_POOL_SIZE		8192
 
 #define RDS_IB_MAX_SGE			8
 #define RDS_IB_RECV_SGE 		2
@@ -19,6 +21,9 @@
 
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
 
+#define RDS_IB_RECYCLE_BATCH_COUNT	32
+
+extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;
 
 /*
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
  * try and minimize the amount of memory tied up both the device and
  * socket receive queues.
  */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
 struct rds_page_frag {
 	struct list_head	f_item;
-	struct page		*f_page;
-	unsigned long		f_offset;
-	dma_addr_t 		f_mapped;
+	struct list_head	f_cache_entry;
+	struct scatterlist	f_sg;
 };
 
 struct rds_ib_incoming {
 	struct list_head	ii_frags;
+	struct list_head	ii_cache_entry;
 	struct rds_incoming	ii_inc;
 };
 
+struct rds_ib_cache_head {
+	struct list_head *first;
+	unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+	struct rds_ib_cache_head *percpu;
+	struct list_head	 *xfer;
+	struct list_head	 *ready;
+};
+
 struct rds_ib_connect_private {
 	/* Add new fields at the end, and don't permute existing fields. */
 	__be32			dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
 };
 
 struct rds_ib_send_work {
-	struct rds_message	*s_rm;
-	struct rds_rdma_op	*s_op;
+	void			*s_op;
 	struct ib_send_wr	s_wr;
 	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
 	unsigned long		s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
 
 	/* tx */
 	struct rds_ib_work_ring	i_send_ring;
-	struct rds_message	*i_rm;
+	struct rm_data_op	*i_data_op;
 	struct rds_header	*i_send_hdrs;
 	u64			i_send_hdrs_dma;
 	struct rds_ib_send_work *i_sends;
+	atomic_t		i_signaled_sends;
 
 	/* rx */
 	struct tasklet_struct	i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
 	struct rds_header	*i_recv_hdrs;
 	u64			i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
-	struct rds_page_frag	i_frag;
 	u64			i_ack_recv;	/* last ACK received */
+	struct rds_ib_refill_cache i_cache_incs;
+	struct rds_ib_refill_cache i_cache_frags;
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
-	long			i_unsignaled_bytes;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
 	unsigned int		max_fmrs;
 	int			max_sge;
 	unsigned int		max_wrs;
+	unsigned int		max_initiator_depth;
+	unsigned int		max_responder_resources;
 	spinlock_t		spinlock;	/* protect the above */
+	atomic_t		refcount;
+	struct work_struct	free_work;
 };
 
+#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
 /* bits for i_ack_flags */
 #define IB_ACK_IN_FLIGHT	0
 #define IB_ACK_REQUESTED	1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_pool_flush;
 	uint64_t	s_ib_rdma_mr_pool_wait;
 	uint64_t	s_ib_rdma_mr_pool_depleted;
+	uint64_t	s_ib_atomic_cswp;
+	uint64_t	s_ib_atomic_fadd;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -241,11 +265,10 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 
 /* ib.c */
 extern struct rds_transport rds_ib_transport;
-extern void rds_ib_add_one(struct ib_device *device);
-extern void rds_ib_remove_one(struct ib_device *device);
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;
 
-extern unsigned int fmr_pool_size;
 extern unsigned int fmr_message_size;
 extern unsigned int rds_ib_retry_count;
 
@@ -258,7 +281,7 @@ void rds_ib_conn_free(void *arg);
 int rds_ib_conn_connect(struct rds_connection *conn);
 void rds_ib_conn_shutdown(struct rds_connection *conn);
 void rds_ib_state_change(struct sock *sk);
-int __init rds_ib_listen_init(void);
+int rds_ib_listen_init(void);
 void rds_ib_listen_stop(void);
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +298,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_ib_destroy_nodev_conns(void)
-{
-	__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
-}
-static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
-{
-	__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
-}
+void rds_ib_destroy_nodev_conns(void);
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -292,14 +307,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);
 
 /* ib_recv.c */
-int __init rds_ib_recv_init(void);
+int rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill);
-void rds_ib_inc_purge(struct rds_incoming *inc);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
@@ -325,17 +342,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
 extern wait_queue_head_t rds_ib_ring_empty_wait;
 
 /* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
 void rds_ib_xmit_complete(struct rds_connection *conn);
 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
 			     u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +363,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_ib_sysctl_init(void);
+int rds_ib_sysctl_init(void);
 void rds_ib_sysctl_exit(void);
 extern unsigned long rds_ib_sysctl_max_send_wr;
 extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -352,30 +371,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
 extern unsigned long rds_ib_sysctl_max_unsig_bytes;
 extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
-extern ctl_table rds_ib_sysctl_table[];
-
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- *
- * From version 3.1 onwards, header is in front of data in the sge.
- */
-static inline struct ib_sge *
-rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[0];
-	else
-		return &sge[1];
-}
-
-static inline struct ib_sge *
-rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[1];
-	else
-		return &sge[0];
-}
 
 #endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db..ee369d201a6 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,36 @@
 #include "rds.h"
 #include "ib.h"
 
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+		[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+	RDS_IB_EVENT_STRING(CQ_ERR),
+	RDS_IB_EVENT_STRING(QP_FATAL),
+	RDS_IB_EVENT_STRING(QP_REQ_ERR),
+	RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+	RDS_IB_EVENT_STRING(COMM_EST),
+	RDS_IB_EVENT_STRING(SQ_DRAINED),
+	RDS_IB_EVENT_STRING(PATH_MIG),
+	RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+	RDS_IB_EVENT_STRING(DEVICE_FATAL),
+	RDS_IB_EVENT_STRING(PORT_ACTIVE),
+	RDS_IB_EVENT_STRING(PORT_ERR),
+	RDS_IB_EVENT_STRING(LID_CHANGE),
+	RDS_IB_EVENT_STRING(PKEY_CHANGE),
+	RDS_IB_EVENT_STRING(SM_CHANGE),
+	RDS_IB_EVENT_STRING(SRQ_ERR),
+	RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+	RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+	RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+	return rds_str_array(rds_ib_event_type_strings,
+			     ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
 /*
  * Set the selected protocol version
  */
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
 	const struct rds_ib_connect_private *dp = NULL;
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct rds_ib_device *rds_ibdev;
 	struct ib_qp_attr qp_attr;
 	int err;
 
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		}
 	}
 
-	printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-			&conn->c_faddr,
-			RDS_PROTOCOL_MAJOR(conn->c_version),
-			RDS_PROTOCOL_MINOR(conn->c_version),
-			ic->i_flowctl ? ", flow control" : "");
+	if (conn->c_version < RDS_PROTOCOL(3,1)) {
+		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+		       " no longer supported\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version));
+		rds_conn_destroy(conn);
+		return;
+	} else {
+		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version),
+		       ic->i_flowctl ? ", flow control" : "");
+	}
 
 	/*
 	 * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	rds_ib_recv_init_ring(ic);
 	/* Post receive buffers - as a side effect, this will update
 	 * the posted credit count. */
-	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+	rds_ib_recv_refill(conn, 1);
 
 	/* Tune RNR behavior */
 	rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	if (err)
 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
-	/* update ib_device with this local ipaddr & conn */
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+	/* update ib_device with this local ipaddr */
+	err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
 	if (err)
-		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-	rds_ib_add_conn(rds_ibdev, conn);
+		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+			err);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 			struct rdma_conn_param *conn_param,
 			struct rds_ib_connect_private *dp,
-			u32 protocol_version)
+			u32 protocol_version,
+			u32 max_responder_resources,
+			u32 max_initiator_depth)
 {
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
-	/* XXX tune these? */
-	conn_param->responder_resources = 1;
-	conn_param->initiator_depth = 1;
+
+	conn_param->responder_resources =
+		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+	conn_param->initiator_depth =
+		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
 	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
 	conn_param->rnr_retry_count = 7;
 
 	if (dp) {
-		struct rds_ib_connection *ic = conn->c_transport_data;
-
 		memset(dp, 0, sizeof(*dp));
 		dp->dp_saddr = conn->c_laddr;
 		dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-	rdsdebug("event %u data %p\n", event->event, data);
+	rdsdebug("event %u (%s) data %p\n",
+		 event->event, rds_ib_event_str(event->event), data);
 }
 
 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 	struct rds_connection *conn = data;
 	struct rds_ib_connection *ic = conn->c_transport_data;
 
-	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 rds_ib_event_str(event->event));
 
 	switch (event->event) {
 	case IB_EVENT_COMM_EST:
 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
 		break;
 	default:
-		rdsdebug("Fatal QP Event %u "
+		rdsdebug("Fatal QP Event %u (%s) "
 			"- connection %pI4->%pI4, reconnecting\n",
-			event->event, &conn->c_laddr, &conn->c_faddr);
+			event->event, rds_ib_event_str(event->event),
+			&conn->c_laddr, &conn->c_faddr);
 		rds_conn_drop(conn);
 		break;
 	}
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	struct rds_ib_device *rds_ibdev;
 	int ret;
 
-	/* rds_ib_add_one creates a rds_ib_device object per IB device,
-	 * and allocates a protection domain, memory range and FMR pool
-	 * for each.  If that fails for any reason, it will not register
-	 * the rds_ibdev at all.
+	/*
+	 * It's normal to see a null device if an incoming connection races
+	 * with device removal, so we don't print a warning.
 	 */
-	rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-	if (rds_ibdev == NULL) {
-		if (printk_ratelimit())
-			printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
-					dev->name);
+	rds_ibdev = rds_ib_get_client_data(dev);
+	if (!rds_ibdev)
 		return -EOPNOTSUPP;
-	}
+
+	/* add the conn now so that connection establishment has the dev */
+	rds_ib_add_conn(rds_ibdev, conn);
 
 	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
 		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 					   ic->i_send_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 					   ic->i_recv_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
 				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}
 
-	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
-	if (ic->i_sends == NULL) {
+	ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
 	}
 	memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
-	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
-	if (ic->i_recvs == NULL) {
+	ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 		 ic->i_send_cq, ic->i_recv_cq);
 
 out:
+	rds_ib_dev_put(rds_ibdev);
 	return ret;
 }
 
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	struct rds_ib_connection *ic = NULL;
 	struct rdma_conn_param conn_param;
 	u32 version;
-	int err, destroy = 1;
+	int err = 1, destroy = 1;
 
 	/* Check whether the remote protocol version matches ours. */
 	version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 			/* Wait and see - our connect may still be succeeding */
 			rds_ib_stats_inc(s_ib_connect_raced);
 		}
-		mutex_unlock(&conn->c_cm_lock);
 		goto out;
 	}
 
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	err = rds_ib_setup_qp(conn);
 	if (err) {
 		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
-		mutex_unlock(&conn->c_cm_lock);
 		goto out;
 	}
 
-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+		event->param.conn.responder_resources,
+		event->param.conn.initiator_depth);
 
 	/* rdma_accept() calls rdma_reject() internally if it fails */
 	err = rdma_accept(cm_id, &conn_param);
-	mutex_unlock(&conn->c_cm_lock);
-	if (err) {
+	if (err)
 		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-		goto out;
-	}
-
-	return 0;
 
 out:
-	rdma_reject(cm_id, NULL, 0);
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+	if (err)
+		rdma_reject(cm_id, NULL, 0);
 	return destroy;
 }
 
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 		goto out;
 	}
 
-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+		UINT_MAX, UINT_MAX);
 	ret = rdma_connect(cm_id, &conn_param);
 	if (ret)
 		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 				ic->i_cm_id, err);
 		}
 
+		/*
+		 * We want to wait for tx and rx completion to finish
+		 * before we tear down the connection, but we have to be
+		 * careful not to get stuck waiting on a send ring that
+		 * only has unsignaled sends in it.  We've shutdown new
+		 * sends before getting here so by waiting for signaled
+		 * sends to complete we're ensured that there will be no
+		 * more tx processing.
+		 */
 		wait_event(rds_ib_ring_empty_wait,
-			rds_ib_ring_empty(&ic->i_send_ring) &&
-			rds_ib_ring_empty(&ic->i_recv_ring));
+			   rds_ib_ring_empty(&ic->i_recv_ring) &&
+			   (atomic_read(&ic->i_signaled_sends) == 0));
+		tasklet_kill(&ic->i_recv_tasklet);
 
 		if (ic->i_send_hdrs)
 			ib_dma_free_coherent(dev,
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 	BUG_ON(ic->rds_ibdev);
 
 	/* Clear pending transmit */
-	if (ic->i_rm) {
-		rds_message_put(ic->i_rm);
-		ic->i_rm = NULL;
+	if (ic->i_data_op) {
+		struct rds_message *rm;
+
+		rm = container_of(ic->i_data_op, struct rds_message, data);
+		rds_message_put(rm);
+		ic->i_data_op = NULL;
 	}
 
 	/* Clear the ACK state */
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
 	struct rds_ib_connection *ic;
 	unsigned long flags;
+	int ret;
 
 	/* XXX too lazy? */
 	ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
 		return -ENOMEM;
 
+	ret = rds_ib_recv_alloc_caches(ic);
+	if (ret) {
+		kfree(ic);
+		return ret;
+	}
+
 	INIT_LIST_HEAD(&ic->ib_node);
 	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
 		     (unsigned long) ic);
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&ic->i_ack_lock);
 #endif
+	atomic_set(&ic->i_signaled_sends, 0);
 
 	/*
 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg)
 	list_del(&ic->ib_node);
 	spin_unlock_irq(lock_ptr);
 
+	rds_ib_recv_free_caches(ic);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a54cd63f9e3..18a833c450c 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -32,11 +32,16 @@
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
+#include "xlist.h"
 
+static struct workqueue_struct *rds_ib_fmr_wq;
+
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
 
 /*
  * This is stored as mr->r_trans_private.
@@ -45,7 +50,11 @@ struct rds_ib_mr {
 	struct rds_ib_device	*device;
 	struct rds_ib_mr_pool	*pool;
 	struct ib_fmr		*fmr;
-	struct list_head	list;
+
+	struct xlist_head	xlist;
+
+	/* unmap_list is for freeing */
+	struct list_head	unmap_list;
 	unsigned int		remap_count;
 
 	struct scatterlist	*sg;
@@ -59,14 +68,16 @@ struct rds_ib_mr {
  */
 struct rds_ib_mr_pool {
 	struct mutex		flush_lock;		/* serialize fmr invalidate */
-	struct work_struct	flush_worker;		/* flush worker */
+	struct delayed_work	flush_worker;		/* flush worker */
 
-	spinlock_t		list_lock;		/* protect variables below */
 	atomic_t		item_count;		/* total # of MRs */
 	atomic_t		dirty_count;		/* # dirty of MRs */
-	struct list_head	drop_list;		/* MRs that have reached their max_maps limit */
-	struct list_head	free_list;		/* unused MRs */
-	struct list_head	clean_list;		/* unused & unamapped MRs */
+
+	struct xlist_head	drop_list;		/* MRs that have reached their max_maps limit */
+	struct xlist_head	free_list;		/* unused MRs */
+	struct xlist_head	clean_list;		/* global unused & unamapped MRs */
+	wait_queue_head_t	flush_wait;
+
 	atomic_t		free_pinned;		/* memory pinned by free MRs */
 	unsigned long		max_items;
 	unsigned long		max_items_soft;
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
 	struct ib_fmr_attr	fmr_attr;
 };
 
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
 static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
 
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_ipaddr *i_ipaddr;
 
-	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-		spin_lock_irq(&rds_ibdev->spinlock);
-		list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 			if (i_ipaddr->ipaddr == ipaddr) {
-				spin_unlock_irq(&rds_ibdev->spinlock);
+				atomic_inc(&rds_ibdev->refcount);
+				rcu_read_unlock();
 				return rds_ibdev;
 			}
 		}
-		spin_unlock_irq(&rds_ibdev->spinlock);
 	}
+	rcu_read_unlock();
 
 	return NULL;
 }
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	i_ipaddr->ipaddr = ipaddr;
 
 	spin_lock_irq(&rds_ibdev->spinlock);
-	list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+	list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
 	spin_unlock_irq(&rds_ibdev->spinlock);
 
 	return 0;
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 
 static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 {
-	struct rds_ib_ipaddr *i_ipaddr, *next;
+	struct rds_ib_ipaddr *i_ipaddr;
+	struct rds_ib_ipaddr *to_free = NULL;
+
 
 	spin_lock_irq(&rds_ibdev->spinlock);
-	list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+	list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 		if (i_ipaddr->ipaddr == ipaddr) {
-			list_del(&i_ipaddr->list);
-			kfree(i_ipaddr);
+			list_del_rcu(&i_ipaddr->list);
+			to_free = i_ipaddr;
 			break;
 		}
 	}
 	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	if (to_free) {
+		synchronize_rcu();
+		kfree(to_free);
+	}
 }
 
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	struct rds_ib_device *rds_ibdev_old;
 
 	rds_ibdev_old = rds_ib_get_device(ipaddr);
-	if (rds_ibdev_old)
+	if (rds_ibdev_old) {
 		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+		rds_ib_dev_put(rds_ibdev_old);
+	}
 
 	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
@@ -150,12 +171,13 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
 	BUG_ON(list_empty(&ic->ib_node));
 	list_del(&ic->ib_node);
 
-	spin_lock_irq(&rds_ibdev->spinlock);
+	spin_lock(&rds_ibdev->spinlock);
 	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
-	spin_unlock_irq(&rds_ibdev->spinlock);
+	spin_unlock(&rds_ibdev->spinlock);
 	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = rds_ibdev;
+	atomic_inc(&rds_ibdev->refcount);
 }
 
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
 	spin_unlock(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = NULL;
+	rds_ib_dev_put(rds_ibdev);
 }
 
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+void rds_ib_destroy_nodev_conns(void)
 {
 	struct rds_ib_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(list_lock);
-	list_splice(list, &tmp_list);
-	INIT_LIST_HEAD(list);
-	spin_unlock_irq(list_lock);
+	spin_lock_irq(&ib_nodev_conns_lock);
+	list_splice(&ib_nodev_conns, &tmp_list);
+	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
 		rds_conn_destroy(ic->conn);
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
 	if (!pool)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&pool->free_list);
-	INIT_LIST_HEAD(&pool->drop_list);
-	INIT_LIST_HEAD(&pool->clean_list);
+	INIT_XLIST_HEAD(&pool->free_list);
+	INIT_XLIST_HEAD(&pool->drop_list);
+	INIT_XLIST_HEAD(&pool->clean_list);
 	mutex_init(&pool->flush_lock);
-	spin_lock_init(&pool->list_lock);
-	INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+	init_waitqueue_head(&pool->flush_wait);
+	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
 	pool->fmr_attr.max_pages = fmr_message_size;
 	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
-	flush_workqueue(rds_wq);
-	rds_ib_flush_mr_pool(pool, 1);
+	cancel_delayed_work_sync(&pool->flush_worker);
+	rds_ib_flush_mr_pool(pool, 1, NULL);
 	WARN_ON(atomic_read(&pool->item_count));
 	WARN_ON(atomic_read(&pool->free_pinned));
 	kfree(pool);
 }
 
+static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
+			 struct rds_ib_mr **ibmr_ret)
+{
+	struct xlist_head *ibmr_xl;
+	ibmr_xl = xlist_del_head_fast(xl);
+	*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
+}
+
 static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
-	unsigned long flags;
+	struct xlist_head *ret;
+	unsigned long *flag;
 
-	spin_lock_irqsave(&pool->list_lock, flags);
-	if (!list_empty(&pool->clean_list)) {
-		ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
-		list_del_init(&ibmr->list);
-	}
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+	preempt_disable();
+	flag = &__get_cpu_var(clean_list_grace);
+	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	ret = xlist_del_head(&pool->clean_list);
+	if (ret)
+		ibmr = list_entry(ret, struct rds_ib_mr, xlist);
 
+	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+	preempt_enable();
 	return ibmr;
 }
 
+static inline void wait_clean_list_grace(void)
+{
+	int cpu;
+	unsigned long *flag;
+
+	for_each_online_cpu(cpu) {
+		flag = &per_cpu(clean_list_grace, cpu);
+		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+			cpu_relax();
+	}
+}
+
 static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 {
 	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 	struct rds_ib_mr *ibmr = NULL;
 	int err = 0, iter = 0;
 
+	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+
 	while (1) {
 		ibmr = rds_ib_reuse_fmr(pool);
 		if (ibmr)
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 
 		/* We do have some empty MRs. Flush them out. */
 		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
-		rds_ib_flush_mr_pool(pool, 0);
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
 	}
 
-	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
 	if (!ibmr) {
 		err = -ENOMEM;
 		goto out_no_cigar;
 	}
 
+	memset(ibmr, 0, sizeof(*ibmr));
+
 	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
 			(IB_ACCESS_LOCAL_WRITE |
 			 IB_ACCESS_REMOTE_READ |
-			 IB_ACCESS_REMOTE_WRITE),
+			 IB_ACCESS_REMOTE_WRITE|
+			 IB_ACCESS_REMOTE_ATOMIC),
 			&pool->fmr_attr);
 	if (IS_ERR(ibmr->fmr)) {
 		err = PTR_ERR(ibmr->fmr);
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
 	if (page_cnt > fmr_message_size)
 		return -EINVAL;
 
-	dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+				 rdsibdev_to_node(rds_ibdev));
 	if (!dma_pages)
 		return -ENOMEM;
 
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 
 			/* FIXME we need a way to tell a r/w MR
 			 * from a r/o MR */
-			BUG_ON(in_interrupt());
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 			put_page(page);
 		}
@@ -477,33 +531,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
 }
 
 /*
+ * given an xlist of mrs, put them all into the list_head for more processing
+ */
+static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head splice;
+	struct xlist_head *cur;
+	struct xlist_head *next;
+
+	splice.next = NULL;
+	xlist_splice(xlist, &splice);
+	cur = splice.next;
+	while (cur) {
+		next = cur->next;
+		ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+		list_add_tail(&ibmr->unmap_list, list);
+		cur = next;
+	}
+}
+
+/*
+ * this takes a list head of mrs and turns it into an xlist of clusters.
+ * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
+ * reuse.
+ */
+static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
+				struct list_head *list, struct xlist_head *xlist,
+				struct xlist_head **tail_ret)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head *cur_mr = xlist;
+	struct xlist_head *tail_mr = NULL;
+
+	list_for_each_entry(ibmr, list, unmap_list) {
+		tail_mr = &ibmr->xlist;
+		tail_mr->next = NULL;
+		cur_mr->next = tail_mr;
+		cur_mr = tail_mr;
+	}
+	*tail_ret = tail_mr;
+}
+
+/*
  * Flush our pool of MRs.
  * At a minimum, all currently unused MRs are unmapped.
  * If the number of MRs allocated exceeds the limit, we also try
  * to free as many MRs as needed to get back to this limit.
  */
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+			        int free_all, struct rds_ib_mr **ibmr_ret)
 {
 	struct rds_ib_mr *ibmr, *next;
+	struct xlist_head clean_xlist;
+	struct xlist_head *clean_tail;
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(fmr_list);
 	unsigned long unpinned = 0;
-	unsigned long flags;
 	unsigned int nfreed = 0, ncleaned = 0, free_goal;
 	int ret = 0;
 
 	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
 
-	mutex_lock(&pool->flush_lock);
+	if (ibmr_ret) {
+		DEFINE_WAIT(wait);
+		while(!mutex_trylock(&pool->flush_lock)) {
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+
+			prepare_to_wait(&pool->flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (xlist_empty(&pool->clean_list))
+				schedule();
+
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+		}
+		finish_wait(&pool->flush_wait, &wait);
+	} else
+		mutex_lock(&pool->flush_lock);
+
+	if (ibmr_ret) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr) {
+			*ibmr_ret = ibmr;
+			goto out;
+		}
+	}
 
-	spin_lock_irqsave(&pool->list_lock, flags);
 	/* Get the list of all MRs to be dropped. Ordering matters -
-	 * we want to put drop_list ahead of free_list. */
-	list_splice_init(&pool->free_list, &unmap_list);
-	list_splice_init(&pool->drop_list, &unmap_list);
+	 * we want to put drop_list ahead of free_list.
+	 */
+	xlist_append_to_list(&pool->drop_list, &unmap_list);
+	xlist_append_to_list(&pool->free_list, &unmap_list);
 	if (free_all)
-		list_splice_init(&pool->clean_list, &unmap_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+		xlist_append_to_list(&pool->clean_list, &unmap_list);
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 		goto out;
 
 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-	list_for_each_entry(ibmr, &unmap_list, list)
+	list_for_each_entry(ibmr, &unmap_list, unmap_list)
 		list_add(&ibmr->fmr->list, &fmr_list);
+
 	ret = ib_unmap_fmr(&fmr_list);
 	if (ret)
 		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
 
 	/* Now we can destroy the DMA mapping and unpin any pages */
-	list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
 		unpinned += ibmr->sg_len;
 		__rds_ib_teardown_mr(ibmr);
 		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
 			rds_ib_stats_inc(s_ib_rdma_mr_free);
-			list_del(&ibmr->list);
+			list_del(&ibmr->unmap_list);
 			ib_dealloc_fmr(ibmr->fmr);
 			kfree(ibmr);
 			nfreed++;
@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 		ncleaned++;
 	}
 
-	spin_lock_irqsave(&pool->list_lock, flags);
-	list_splice(&unmap_list, &pool->clean_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+	if (!list_empty(&unmap_list)) {
+		/* we have to make sure that none of the things we're about
+		 * to put on the clean list would race with other cpus trying
+		 * to pull items off.  The xlist would explode if we managed to
+		 * remove something from the clean list and then add it back again
+		 * while another CPU was spinning on that same item in xlist_del_head.
+		 *
+		 * This is pretty unlikely, but just in case  wait for an xlist grace period
+		 * here before adding anything back into the clean list.
+		 */
+		wait_clean_list_grace();
+
+		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+		if (ibmr_ret)
+			refill_local(pool, &clean_xlist, ibmr_ret);
+
+		/* refill_local may have emptied our list */
+		if (!xlist_empty(&clean_xlist))
+			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+
+	}
 
 	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 
 out:
 	mutex_unlock(&pool->flush_lock);
+	if (waitqueue_active(&pool->flush_wait))
+		wake_up(&pool->flush_wait);
+out_nolock:
 	return ret;
 }
 
+int rds_ib_fmr_init(void)
+{
+	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
+	if (!rds_ib_fmr_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * By the time this is called all the IB devices should have been torn down and
+ * had their pools freed.  As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_fmr_exit(void)
+{
+	destroy_workqueue(rds_ib_fmr_wq);
+}
+
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
-	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
 
-	rds_ib_flush_mr_pool(pool, 0);
+	rds_ib_flush_mr_pool(pool, 0, NULL);
 }
 
 void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	struct rds_ib_mr *ibmr = trans_private;
 	struct rds_ib_device *rds_ibdev = ibmr->device;
 	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
-	unsigned long flags;
 
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
 	/* Return it to the pool's free list */
-	spin_lock_irqsave(&pool->list_lock, flags);
 	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-		list_add(&ibmr->list, &pool->drop_list);
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
 	else
-		list_add(&ibmr->list, &pool->free_list);
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
 
 	atomic_add(ibmr->sg_len, &pool->free_pinned);
 	atomic_inc(&pool->dirty_count);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
 
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
 	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_work(rds_wq, &pool->flush_worker);
+		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
-			rds_ib_flush_mr_pool(pool, 0);
+			rds_ib_flush_mr_pool(pool, 0, NULL);
 		} else {
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time. */
-			queue_work(rds_wq, &pool->flush_worker);
+			queue_delayed_work(rds_ib_fmr_wq,
+					   &pool->flush_worker, 10);
 		}
 	}
+
+	rds_ib_dev_put(rds_ibdev);
 }
 
 void rds_ib_flush_mrs(void)
 {
 	struct rds_ib_device *rds_ibdev;
 
+	down_read(&rds_ib_devices_lock);
 	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
 		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 
 		if (pool)
-			rds_ib_flush_mr_pool(pool, 0);
+			rds_ib_flush_mr_pool(pool, 0, NULL);
 	}
+	up_read(&rds_ib_devices_lock);
 }
 
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
 	ibmr->device = rds_ibdev;
+	rds_ibdev = NULL;
 
  out:
 	if (ret) {
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 			rds_ib_free_mr(ibmr, 0);
 		ibmr = ERR_PTR(ret);
 	}
+	if (rds_ibdev)
+		rds_ib_dev_put(rds_ibdev);
 	return ibmr;
 }
+
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b..e29e0ca32f7 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
-	kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time.  Its fragments are posted in order.  This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-				   struct rds_ib_recv_work *recv)
-{
-	struct rds_page_frag *frag = recv->r_frag;
-
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-			       frag->f_mapped,
-			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
 	struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		recv->r_wr.sg_list = recv->r_sge;
 		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-		sge = rds_ib_data_sge(ic, recv->r_sge);
+		sge = &recv->r_sge[0];
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = ic->i_mr->lkey;
+
+		sge = &recv->r_sge[1];
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
 		sge->lkey = ic->i_mr->lkey;
+	}
+}
 
-		sge = rds_ib_header_sge(ic, recv->r_sge);
-		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	struct list_head *from_last = from->prev;
+
+	list_splice_tail(from_last, to);
+	list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *tmp;
+
+	tmp = xchg(&cache->xfer, NULL);
+	if (tmp) {
+		if (cache->ready)
+			list_splice_entire_tail(tmp, cache->ready);
+		else
+			cache->ready = tmp;
+	}
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+	       return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		head->first = NULL;
+		head->count = 0;
+	}
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (!ret) {
+		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+		if (ret)
+			free_percpu(ic->i_cache_incs.percpu);
 	}
+
+	return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		if (head->first) {
+			list_splice_entire_tail(head->first, caller_list);
+			head->first = NULL;
+		}
+	}
+
+	if (cache->ready) {
+		list_splice_entire_tail(cache->ready, caller_list);
+		cache->ready = NULL;
+	}
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+	}
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+	/* Free attached frags */
+	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
 }
 
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 		recv->r_ibinc = NULL;
 	}
 	if (recv->r_frag) {
-		rds_ib_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_ib_frag_drop_page(recv->r_frag);
-		rds_ib_frag_free(recv->r_frag);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+		rds_ib_frag_free(ic, recv->r_frag);
 		recv->r_frag = NULL;
 	}
 }
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_ib_frag_drop_page(&ic->i_frag);
 }
 
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv,
-				  gfp_t kptr_gfp, gfp_t page_gfp)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
 {
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
-	struct ib_sge *sge;
-	int ret = -ENOMEM;
+	struct rds_ib_incoming *ibinc;
+	struct list_head *cache_item;
+	int avail_allocs;
 
-	if (recv->r_ibinc == NULL) {
-		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (cache_item) {
+		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+	} else {
+		avail_allocs = atomic_add_unless(&rds_ib_allocation,
+						 1, rds_ib_sysctl_max_recv_allocation);
+		if (!avail_allocs) {
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
-			goto out;
+			return NULL;
 		}
-		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
-						 kptr_gfp);
-		if (recv->r_ibinc == NULL) {
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+		if (!ibinc) {
 			atomic_dec(&rds_ib_allocation);
-			goto out;
+			return NULL;
 		}
-		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
 
-	if (recv->r_frag == NULL) {
-		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
-			goto out;
-		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
+	return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, page_mask);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
 	}
 
-	if (ic->i_frag.f_page == NULL) {
-		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
-			goto out;
-		ic->i_frag.f_offset = 0;
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
 	}
 
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				  ic->i_frag.f_page,
-				  ic->i_frag.f_offset,
-				  RDS_FRAG_SIZE,
-				  DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
 
 	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-	 * must be called on this recv.  This happens as completions hit
-	 * in order or on connection shutdown.
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
 	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
 
-	sge = rds_ib_data_sge(ic, recv->r_sge);
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
 
-	sge = rds_ib_header_sge(ic, recv->r_sge);
+	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
+	sge = &recv->r_sge[1];
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -216,13 +350,11 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
  *
  * -1 is returned if posting fails due to temporary resource exhaustion.
  */
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		if (pos >= ic->i_recv_ring.w_nr) {
 			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
 					pos);
-			ret = -EINVAL;
 			break;
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
 		if (ret) {
-			ret = -1;
 			break;
 		}
 
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_ibinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
 			       "%pI4 returned %d, disconnecting and "
 			       "reconnecting\n", &conn->c_faddr,
 			       ret);
-			ret = -1;
 			break;
 		}
 
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 
 	if (ret)
 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-	return ret;
 }
 
-void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				 struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-	struct rds_page_frag *frag;
-	struct rds_page_frag *pos;
+	unsigned long flags;
+	struct rds_ib_cache_head *chp;
+	struct list_head *old;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+	local_irq_save(flags);
 
-	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-		list_del_init(&frag->f_item);
-		rds_ib_frag_drop_page(frag);
-		rds_ib_frag_free(frag);
-	}
+	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+	if (!chp->first)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chp->first);
+	chp->first = new_item;
+	chp->count++;
+
+	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chp->first);
+		old = cmpxchg(&cache->xfer, NULL, chp->first);
+	} while (old);
+
+	chp->first = NULL;
+	chp->count = 0;
+end:
+	local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	struct list_head *head = cache->ready;
+
+	if (head) {
+		if (!list_empty(head)) {
+			cache->ready = head->next;
+			list_del_init(head);
+		} else
+			cache->ready = NULL;
+	}
 
-	rds_ib_inc_purge(inc);
-	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-	BUG_ON(!list_empty(&ibinc->ii_frags));
-	kmem_cache_free(rds_ib_incoming_slab, ibinc);
-	atomic_dec(&rds_ib_allocation);
-	BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+	return head;
 }
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
 		to_copy = min_t(unsigned long, to_copy, len - copied);
 
 		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-			 "[%p, %lu] + %lu\n",
+			 "[%p, %u] + %lu\n",
 			 to_copy, iov->iov_base, iov->iov_len, iov_off,
-			 frag->f_page, frag->f_offset, frag_off);
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
 		/* XXX needs + offset for multiple recvs per page */
-		ret = rds_page_copy_to_user(frag->f_page,
-					    frag->f_offset + frag_off,
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
 					    iov->iov_base + iov_off,
 					    to_copy);
 		if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
 	return rds_ib_get_ack(ic);
 }
 
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
-					    struct rds_ib_recv_work *recv,
-					    u32 data_len)
-{
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
-	void *addr;
-	u32 misplaced_hdr_bytes;
-
-	/*
-	 * Support header at the front (RDS 3.1+) as well as header-at-end.
-	 *
-	 * Cases:
-	 * 1) header all in header buff (great!)
-	 * 2) header all in data page (copy all to header buff)
-	 * 3) header split across hdr buf + data page
-	 *    (move bit in hdr buff to end before copying other bit from data page)
-	 */
-	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
-	        return hdr_buff;
-
-	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
-		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-		memcpy(hdr_buff,
-		       addr + recv->r_frag->f_offset + data_len,
-		       sizeof(struct rds_header));
-		kunmap_atomic(addr, KM_SOFTIRQ0);
-		return hdr_buff;
-	}
-
-	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
-	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
-	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
-	       sizeof(struct rds_header) - misplaced_hdr_bytes);
-	kunmap_atomic(addr, KM_SOFTIRQ0);
-	return hdr_buff;
-}
-
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	}
 	data_len -= sizeof(struct rds_header);
 
-	ihdr = rds_ib_get_header(conn, recv, data_len);
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		 * the inc is freed.  We don't go that route, so we have to drop the
 		 * page ref ourselves.  We can't just leave the page on the recv
 		 * because that confuses the dma mapping of pages and each recv's use
-		 * of a partial page.  We can leave the frag, though, it will be
-		 * reused.
+		 * of a partial page.
 		 *
 		 * FIXME: Fold this into the code path below.
 		 */
-		rds_ib_frag_drop_page(recv->r_frag);
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
 		return;
 	}
 
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (ibinc == NULL) {
+	if (!ibinc) {
 		ibinc = recv->r_ibinc;
 		recv->r_ibinc = NULL;
 		ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 	struct rds_ib_recv_work *recv;
 
 	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
 			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_rx_cq_event);
 
 		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-		rds_ib_recv_unmap_page(ic, recv);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
 		/*
 		 * Also process recvs in connecting state because it is possible
 		 * to get a recv completion _before_ the rdmacm ESTABLISHED
 		 * event is processed.
 		 */
-		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
 			/* We expect errors as the qp is drained during shutdown */
-			if (wc.status == IB_WC_SUCCESS) {
-				rds_ib_process_recv(conn, recv, wc.byte_len, state);
-			} else {
-				rds_ib_conn_error(conn, "recv completion on "
-				       "%pI4 had status %u, disconnecting and "
-				       "reconnecting\n", &conn->c_faddr,
-				       wc.status);
-			}
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
 		}
 
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry.  The refilling path can
+		 * leak if we don't.
+		 */
 		rds_ib_ring_free(&ic->i_recv_ring, 1);
 	}
 }
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
 	if (rds_ib_ring_empty(&ic->i_recv_ring))
 		rds_ib_stats_inc(s_ib_rx_ring_empty);
 
-	/*
-	 * If the ring is running low, then schedule the thread to refill.
-	 */
 	if (rds_ib_ring_low(&ic->i_recv_ring))
-		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+		rds_ib_recv_refill(conn, 0);
 }
 
 int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
 	int ret = 0;
 
 	rdsdebug("conn %p\n", conn);
-
-	/*
-	 * If we get a temporary posting failure in this context then
-	 * we're really low and we want the caller to back off for a bit.
-	 */
-	mutex_lock(&ic->i_recv_mutex);
-	if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
-		ret = -ENOMEM;
-	else
-		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
-	mutex_unlock(&ic->i_recv_mutex);
-
 	if (rds_conn_up(conn))
 		rds_ib_attempt_ack(ic);
 
 	return ret;
 }
 
-int __init rds_ib_recv_init(void)
+int rds_ib_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
 
 	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
 					sizeof(struct rds_ib_incoming),
-					0, 0, NULL);
-	if (rds_ib_incoming_slab == NULL)
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_incoming_slab)
 		goto out;
 
 	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
 					sizeof(struct rds_page_frag),
-					0, 0, NULL);
-	if (rds_ib_frag_slab == NULL)
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_frag_slab)
 		kmem_cache_destroy(rds_ib_incoming_slab);
 	else
 		ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab..71f373c421b 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,11 +36,49 @@
 #include <linux/dmapool.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
 
-static void rds_ib_send_rdma_complete(struct rds_message *rm,
-				      int wc_status)
+static char *rds_ib_wc_status_strings[] = {
+#define RDS_IB_WC_STATUS_STR(foo) \
+		[IB_WC_##foo] = __stringify(IB_WC_##foo)
+	RDS_IB_WC_STATUS_STR(SUCCESS),
+	RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+	RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+	RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+	RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+	RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+	RDS_IB_WC_STATUS_STR(FATAL_ERR),
+	RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+	RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)
+{
+	return rds_str_array(rds_ib_wc_status_strings,
+			     ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
+
+/*
+ * Convert IB-specific error message to RDS error message and call core
+ * completion handler.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+				 int wc_status,
+				 void (*complete)(struct rds_message *rm, int status))
 {
 	int notify_status;
 
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
 		notify_status = RDS_RDMA_OTHER_ERROR;
 		break;
 	}
-	rds_rdma_send_complete(rm, notify_status);
+	complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
 }
 
 static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
-				   struct rds_rdma_op *op)
+				   struct rm_rdma_op *op,
+				   int wc_status)
 {
-	if (op->r_mapped) {
+	if (op->op_mapped) {
 		ib_dma_unmap_sg(ic->i_cm_id->device,
-			op->r_sg, op->r_nents,
-			op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		op->r_mapped = 0;
+				op->op_sg, op->op_nents,
+				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
 	}
+
+	/* If the user asked for a completion notification on this
+	 * message, we can implement three different semantics:
+	 *  1.	Notify when we received the ACK on the RDS message
+	 *	that was queued with the RDMA. This provides reliable
+	 *	notification of RDMA status at the expense of a one-way
+	 *	packet delay.
+	 *  2.	Notify when the IB stack gives us the completion event for
+	 *	the RDMA operation.
+	 *  3.	Notify when the IB stack gives us the completion event for
+	 *	the accompanying RDS messages.
+	 * Here, we implement approach #3. To implement approach #2,
+	 * we would need to take an event for the rdma WR. To implement #1,
+	 * don't call rds_rdma_send_complete at all, and fall back to the notify
+	 * handling in the ACK processing code.
+	 *
+	 * Note: There's no need to explicitly sync any RDMA buffers using
+	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+	 * operation itself unmapped the RDMA buffers, which takes care
+	 * of synching.
+	 */
+	rds_ib_send_complete(container_of(op, struct rds_message, rdma),
+			     wc_status, rds_rdma_send_complete);
+
+	if (op->op_write)
+		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+	else
+		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
 }
 
-static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
-			  struct rds_ib_send_work *send,
-			  int wc_status)
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+				     struct rm_atomic_op *op,
+				     int wc_status)
 {
-	struct rds_message *rm = send->s_rm;
-
-	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
-	ib_dma_unmap_sg(ic->i_cm_id->device,
-		     rm->m_sg, rm->m_nents,
-		     DMA_TO_DEVICE);
-
-	if (rm->m_rdma_op != NULL) {
-		rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
-
-		/* If the user asked for a completion notification on this
-		 * message, we can implement three different semantics:
-		 *  1.	Notify when we received the ACK on the RDS message
-		 *	that was queued with the RDMA. This provides reliable
-		 *	notification of RDMA status at the expense of a one-way
-		 *	packet delay.
-		 *  2.	Notify when the IB stack gives us the completion event for
-		 *	the RDMA operation.
-		 *  3.	Notify when the IB stack gives us the completion event for
-		 *	the accompanying RDS messages.
-		 * Here, we implement approach #3. To implement approach #2,
-		 * call rds_rdma_send_complete from the cq_handler. To implement #1,
-		 * don't call rds_rdma_send_complete at all, and fall back to the notify
-		 * handling in the ACK processing code.
-		 *
-		 * Note: There's no need to explicitly sync any RDMA buffers using
-		 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
-		 * operation itself unmapped the RDMA buffers, which takes care
-		 * of synching.
-		 */
-		rds_ib_send_rdma_complete(rm, wc_status);
+	/* unmap atomic recvbuf */
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+				DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
 
-		if (rm->m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
-		else
-			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+	rds_ib_send_complete(container_of(op, struct rds_message, atomic),
+			     wc_status, rds_atomic_send_complete);
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+		rds_ib_stats_inc(s_ib_atomic_cswp);
+	else
+		rds_ib_stats_inc(s_ib_atomic_fadd);
+}
+
+/*
+ * Unmap the resources associated with a struct send_work.
+ *
+ * Returns the rm for no good reason other than it is unobtainable
+ * other than by switching on wr.opcode, currently, and the caller,
+ * the event handler, needs it.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+						struct rds_ib_send_work *send,
+						int wc_status)
+{
+	struct rds_message *rm = NULL;
+
+	/* In the error case, wc.opcode sometimes contains garbage */
+	switch (send->s_wr.opcode) {
+	case IB_WR_SEND:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, data);
+			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_READ:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, rdma);
+			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, atomic);
+			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+		}
+		break;
+	default:
+		if (printk_ratelimit())
+			printk(KERN_NOTICE
+			       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+			       __func__, send->s_wr.opcode);
+		break;
 	}
 
-	/* If anyone waited for this message to get flushed out, wake
-	 * them up now */
-	rds_message_unmapped(rm);
+	send->s_wr.opcode = 0xdead;
 
-	rds_message_put(rm);
-	send->s_rm = NULL;
+	return rm;
 }
 
 void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
 		struct ib_sge *sge;
 
-		send->s_rm = NULL;
 		send->s_op = NULL;
 
 		send->s_wr.wr_id = i;
 		send->s_wr.sg_list = send->s_sge;
-		send->s_wr.num_sge = 1;
-		send->s_wr.opcode = IB_WR_SEND;
-		send->s_wr.send_flags = 0;
 		send->s_wr.ex.imm_data = 0;
 
-		sge = rds_ib_data_sge(ic, send->s_sge);
-		sge->lkey = ic->i_mr->lkey;
-
-		sge = rds_ib_header_sge(ic, send->s_sge);
+		sge = &send->s_sge[0];
 		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
 		sge->length = sizeof(struct rds_header);
 		sge->lkey = ic->i_mr->lkey;
+
+		send->s_sge[1].lkey = ic->i_mr->lkey;
 	}
 }
 
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
 	u32 i;
 
 	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
-		if (send->s_wr.opcode == 0xdead)
-			continue;
-		if (send->s_rm)
-			rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
-		if (send->s_op)
-			rds_ib_send_unmap_rdma(ic, send->s_op);
+		if (send->s_op && send->s_wr.opcode != 0xdead)
+			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
 	}
 }
 
 /*
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+	    waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
+
+/*
  * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
  * operations performed in the send path.  As the sender allocs and potentially
  * unallocs the next free entry in the ring it doesn't alter which is
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 {
 	struct rds_connection *conn = context;
 	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_message *rm = NULL;
 	struct ib_wc wc;
 	struct rds_ib_send_work *send;
 	u32 completed;
 	u32 oldest;
 	u32 i = 0;
 	int ret;
+	int nr_sig = 0;
 
 	rdsdebug("cq %p conn %p\n", cq, conn);
 	rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
 
 	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
 			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_tx_cq_event);
 
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 		for (i = 0; i < completed; i++) {
 			send = &ic->i_sends[oldest];
+			if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+				nr_sig++;
 
-			/* In the error case, wc.opcode sometimes contains garbage */
-			switch (send->s_wr.opcode) {
-			case IB_WR_SEND:
-				if (send->s_rm)
-					rds_ib_send_unmap_rm(ic, send, wc.status);
-				break;
-			case IB_WR_RDMA_WRITE:
-			case IB_WR_RDMA_READ:
-				/* Nothing to be done - the SG list will be unmapped
-				 * when the SEND completes. */
-				break;
-			default:
-				if (printk_ratelimit())
-					printk(KERN_NOTICE
-						"RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
-						__func__, send->s_wr.opcode);
-				break;
-			}
+			rm = rds_ib_send_unmap_op(ic, send, wc.status);
 
-			send->s_wr.opcode = 0xdead;
-			send->s_wr.num_sge = 1;
 			if (send->s_queued + HZ/2 < jiffies)
 				rds_ib_stats_inc(s_ib_tx_stalled);
 
-			/* If a RDMA operation produced an error, signal this right
-			 * away. If we don't, the subsequent SEND that goes with this
-			 * RDMA will be canceled with ERR_WFLUSH, and the application
-			 * never learn that the RDMA failed. */
-			if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
-				struct rds_message *rm;
-
-				rm = rds_send_get_message(conn, send->s_op);
-				if (rm) {
-					if (rm->m_rdma_op)
-						rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
-					rds_ib_send_rdma_complete(rm, wc.status);
-					rds_message_put(rm);
+			if (send->s_op) {
+				if (send->s_op == rm->m_final_op) {
+					/* If anyone waited for this message to get flushed out, wake
+					 * them up now */
+					rds_message_unmapped(rm);
 				}
+				rds_message_put(rm);
+				send->s_op = NULL;
 			}
 
 			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
 		}
 
 		rds_ib_ring_free(&ic->i_send_ring, completed);
+		rds_ib_sub_signaled(ic, nr_sig);
+		nr_sig = 0;
 
 		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
 		    test_bit(0, &conn->c_map_queued))
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
 
 		/* We expect errors as the qp is drained during shutdown */
 		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-			rds_ib_conn_error(conn,
-				"send completion on %pI4 "
-				"had status %u, disconnecting and reconnecting\n",
-				&conn->c_faddr, wc.status);
+			rds_ib_conn_error(conn, "send completion on %pI4 had status "
+					  "%u (%s), disconnecting and reconnecting\n",
+					  &conn->c_faddr, wc.status,
+					  rds_ib_wc_status_str(wc.status));
 		}
 	}
 }
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
  * credits (see rds_ib_send_add_credits below).
  *
  * The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
  * However, the ACK sending code is independent and can race with
  * message SENDs.
  *
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
 		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
 }
 
-static inline void
-rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
-		struct rds_ib_send_work *send, unsigned int pos,
-		unsigned long buffer, unsigned int length,
-		int send_flags)
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+					     struct rds_ib_send_work *send,
+					     bool notify)
 {
-	struct ib_sge *sge;
-
-	WARN_ON(pos != send - ic->i_sends);
-
-	send->s_wr.send_flags = send_flags;
-	send->s_wr.opcode = IB_WR_SEND;
-	send->s_wr.num_sge = 2;
-	send->s_wr.next = NULL;
-	send->s_queued = jiffies;
-	send->s_op = NULL;
-
-	if (length != 0) {
-		sge = rds_ib_data_sge(ic, send->s_sge);
-		sge->addr = buffer;
-		sge->length = length;
-		sge->lkey = ic->i_mr->lkey;
-
-		sge = rds_ib_header_sge(ic, send->s_sge);
-	} else {
-		/* We're sending a packet with no payload. There is only
-		 * one SGE */
-		send->s_wr.num_sge = 1;
-		sge = &send->s_sge[0];
+	/*
+	 * We want to delay signaling completions just enough to get
+	 * the batching benefits but not so much that we create dead time
+	 * on the wire.
+	 */
+	if (ic->i_unsignaled_wrs-- == 0 || notify) {
+		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+		send->s_wr.send_flags |= IB_SEND_SIGNALED;
+		return 1;
 	}
-
-	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
-	sge->length = sizeof(struct rds_header);
-	sge->lkey = ic->i_mr->lkey;
+	return 0;
 }
 
 /*
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	u32 pos;
 	u32 i;
 	u32 work_alloc;
-	u32 credit_alloc;
+	u32 credit_alloc = 0;
 	u32 posted;
 	u32 adv_credits = 0;
 	int send_flags = 0;
-	int sent;
+	int bytes_sent = 0;
 	int ret;
 	int flow_controlled = 0;
+	int nr_sig = 0;
 
 	BUG_ON(off % RDS_FRAG_SIZE);
 	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		goto out;
 	}
 
-	credit_alloc = work_alloc;
 	if (ic->i_flowctl) {
 		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
 		adv_credits += posted;
 		if (credit_alloc < work_alloc) {
 			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
 			work_alloc = credit_alloc;
-			flow_controlled++;
+			flow_controlled = 1;
 		}
 		if (work_alloc == 0) {
 			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* map the message the first time we see it */
-	if (ic->i_rm == NULL) {
-		/*
-		printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
-				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
-				rm->m_inc.i_hdr.h_flags,
-				be32_to_cpu(rm->m_inc.i_hdr.h_len));
-		   */
-		if (rm->m_nents) {
-			rm->m_count = ib_dma_map_sg(dev,
-					 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-			if (rm->m_count == 0) {
+	if (!ic->i_data_op) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
 				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
 				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->m_count = 0;
+			rm->data.op_count = 0;
 		}
 
-		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-		ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
 		rds_message_addref(rm);
-		ic->i_rm = rm;
+		ic->i_data_op = &rm->data;
 
 		/* Finalize the header */
 		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->m_rdma_op) {
+		if (rm->rdma.op_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		/*
 		 * Update adv_credits since we reset the ACK_REQUIRED bit.
 		 */
-		rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
-		adv_credits += posted;
-		BUG_ON(adv_credits > 255);
+		if (ic->i_flowctl) {
+			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+			adv_credits += posted;
+			BUG_ON(adv_credits > 255);
+		}
 	}
 
-	send = &ic->i_sends[pos];
-	first = send;
-	prev = NULL;
-	scat = &rm->m_sg[sg];
-	sent = 0;
-	i = 0;
-
 	/* Sometimes you want to put a fence between an RDMA
 	 * READ and the following SEND.
 	 * We could either do this all the time
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+	if (rm->rdma.op_active && rm->rdma.op_fence)
 		send_flags = IB_SEND_FENCE;
 
-	/*
-	 * We could be copying the header into the unused tail of the page.
-	 * That would need to be changed in the future when those pages might
-	 * be mapped userspace pages or page cache pages.  So instead we always
-	 * use a second sge and our long-lived ring of mapped headers.  We send
-	 * the header after the data so that the data payload can be aligned on
-	 * the receiver.
-	 */
+	/* Each frag gets a header. Msgs may be 0 bytes */
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &ic->i_data_op->op_sg[sg];
+	i = 0;
+	do {
+		unsigned int len = 0;
 
-	/* handle a 0-len message */
-	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
-		rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
-		goto add_header;
-	}
+		/* Set up the header */
+		send->s_wr.send_flags = send_flags;
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.num_sge = 1;
+		send->s_wr.next = NULL;
+		send->s_queued = jiffies;
+		send->s_op = NULL;
 
-	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
-		unsigned int len;
+		send->s_sge[0].addr = ic->i_send_hdrs_dma
+			+ (pos * sizeof(struct rds_header));
+		send->s_sge[0].length = sizeof(struct rds_header);
 
-		send = &ic->i_sends[pos];
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
 
-		len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
-		rds_ib_xmit_populate_wr(ic, send, pos,
-				ib_sg_dma_address(dev, scat) + off, len,
-				send_flags);
+		/* Set up the data, if present */
+		if (i < work_alloc
+		    && scat != &rm->data.op_sg[rm->data.op_count]) {
+			len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+			send->s_wr.num_sge = 2;
 
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time
-		 * on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		}
+			send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+			send->s_sge[1].length = len;
 
-		ic->i_unsignaled_bytes -= len;
-		if (ic->i_unsignaled_bytes <= 0) {
-			ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+			bytes_sent += len;
+			off += len;
+			if (off == ib_sg_dma_len(dev, scat)) {
+				scat++;
+				off = 0;
+			}
 		}
 
+		rds_ib_set_wr_signal_state(ic, send, 0);
+
 		/*
 		 * Always signal the last one if we're stopping due to flow control.
 		 */
-		if (flow_controlled && i == (work_alloc-1))
+		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
 			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 
+		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+			nr_sig++;
+
 		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
 			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
 
-		sent += len;
-		off += len;
-		if (off == ib_sg_dma_len(dev, scat)) {
-			scat++;
-			off = 0;
-		}
-
-add_header:
-		/* Tack on the header after the data. The header SGE should already
-		 * have been set up to point to the right header buffer. */
-		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
-		if (0) {
-			struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
-			printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
-				be16_to_cpu(hdr->h_dport),
-				hdr->h_flags,
-				be32_to_cpu(hdr->h_len));
-		}
-		if (adv_credits) {
+		if (ic->i_flowctl && adv_credits) {
 			struct rds_header *hdr = &ic->i_send_hdrs[pos];
 
 			/* add credit and redo the header checksum */
@@ -689,20 +721,25 @@ add_header:
 		prev = send;
 
 		pos = (pos + 1) % ic->i_send_ring.w_nr;
-	}
+		send = &ic->i_sends[pos];
+		i++;
+
+	} while (i < work_alloc
+		 && scat != &rm->data.op_sg[rm->data.op_count]);
 
 	/* Account the RDS header in the number of bytes we sent, but just once.
 	 * The caller has no concept of fragmentation. */
 	if (hdr_off == 0)
-		sent += sizeof(struct rds_header);
+		bytes_sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->m_sg[rm->m_count]) {
-		prev->s_rm = ic->i_rm;
-		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		ic->i_rm = NULL;
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
+		prev->s_op = ic->i_data_op;
+		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+		ic->i_data_op = NULL;
 	}
 
+	/* Put back wrs & credits we didn't use */
 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
@@ -710,6 +747,9 @@ add_header:
 	if (ic->i_flowctl && i < credit_alloc)
 		rds_ib_send_add_credits(conn, credit_alloc - i);
 
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	/* XXX need to worry about failed_wr and partial sends. */
 	failed_wr = &first->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -720,32 +760,127 @@ add_header:
 		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-		if (prev->s_rm) {
-			ic->i_rm = prev->s_rm;
-			prev->s_rm = NULL;
+		rds_ib_sub_signaled(ic, nr_sig);
+		if (prev->s_op) {
+			ic->i_data_op = prev->s_op;
+			prev->s_op = NULL;
 		}
 
 		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
 		goto out;
 	}
 
-	ret = sent;
+	ret = bytes_sent;
 out:
 	BUG_ON(adv_credits);
 	return ret;
 }
 
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct ib_send_wr *failed_wr;
+	struct rds_ib_device *rds_ibdev;
+	u32 pos;
+	u32 work_alloc;
+	int ret;
+	int nr_sig = 0;
+
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* address of send request in ring */
+	send = &ic->i_sends[pos];
+	send->s_queued = jiffies;
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+		send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+		send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+		send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+	} else { /* FADD */
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+		send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+		send->s_wr.wr.atomic.swap = 0;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+		send->s_wr.wr.atomic.swap_mask = 0;
+	}
+	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+	send->s_wr.num_sge = 1;
+	send->s_wr.next = NULL;
+	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+	send->s_wr.wr.atomic.rkey = op->op_rkey;
+	send->s_op = op;
+	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+	/* map 8 byte retval buffer to the device */
+	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+	if (ret != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+		ret = -ENOMEM; /* XXX ? */
+		goto out;
+	}
+
+	/* Convert our struct scatterlist to struct ib_sge */
+	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].lkey = ic->i_mr->lkey;
+
+	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+		 send->s_sge[0].addr, send->s_sge[0].length);
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+		 send, &send->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		goto out;
+	}
+
+	if (unlikely(failed_wr != &send->s_wr)) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+		BUG_ON(failed_wr != &send->s_wr);
+	}
+
+out:
+	return ret;
+}
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_send_work *send = NULL;
 	struct rds_ib_send_work *first;
 	struct rds_ib_send_work *prev;
 	struct ib_send_wr *failed_wr;
-	struct rds_ib_device *rds_ibdev;
 	struct scatterlist *scat;
 	unsigned long len;
-	u64 remote_addr = op->r_remote_addr;
+	u64 remote_addr = op->op_remote_addr;
+	u32 max_sge = ic->rds_ibdev->max_sge;
 	u32 pos;
 	u32 work_alloc;
 	u32 i;
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	int sent;
 	int ret;
 	int num_sge;
-
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
-	/* map the message the first time we see it */
-	if (!op->r_mapped) {
-		op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents, (op->r_write) ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-		if (op->r_count == 0) {
+	int nr_sig = 0;
+
+	/* map the op the first time we see it */
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
 			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
 			ret = -ENOMEM; /* XXX ? */
 			goto out;
 		}
 
-		op->r_mapped = 1;
+		op->op_mapped = 1;
 	}
 
 	/*
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->r_count, rds_ibdev->max_sge);
+	i = ceil(op->op_count, max_sge);
 
 	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &op->r_sg[0];
+	scat = &op->op_sg[0];
 	sent = 0;
-	num_sge = op->r_count;
+	num_sge = op->op_count;
 
-	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags = IB_SEND_SIGNALED;
-		}
+		send->s_op = NULL;
+
+		nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 
-		send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
-		send->s_wr.wr.rdma.rkey = op->r_key;
-		send->s_op = op;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
 
-		if (num_sge > rds_ibdev->max_sge) {
-			send->s_wr.num_sge = rds_ibdev->max_sge;
-			num_sge -= rds_ibdev->max_sge;
+		if (num_sge > max_sge) {
+			send->s_wr.num_sge = max_sge;
+			num_sge -= max_sge;
 		} else {
 			send->s_wr.num_sge = num_sge;
 		}
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		if (prev)
 			prev->s_wr.next = &send->s_wr;
 
-		for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
 			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 			send->s_sge[j].addr =
 				 ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 			send = ic->i_sends;
 	}
 
-	/* if we finished the message then send completion owns it */
-	if (scat == &op->r_sg[op->r_count])
-		prev->s_wr.send_flags = IB_SEND_SIGNALED;
+	/* give a reference to the last op */
+	if (scat == &op->op_sg[op->op_count]) {
+		prev->s_op = op;
+		rds_message_addref(container_of(op, struct rds_message, rdma));
+	}
 
 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
 	}
 
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	failed_wr = &first->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
 	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
 		goto out;
 	}
 
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d2c904dd6fb..2d5965d6e97 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
 	"ib_rdma_mr_pool_flush",
 	"ib_rdma_mr_pool_wait",
 	"ib_rdma_mr_pool_depleted",
+	"ib_atomic_cswp",
+	"ib_atomic_fadd",
 };
 
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index 03f01cb4e0f..1253b006efd 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
 static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
 static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
 
-unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-
 /*
  * This sysctl does nothing.
  *
@@ -65,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
  */
 unsigned int rds_ib_sysctl_flow_control = 0;
 
-ctl_table rds_ib_sysctl_table[] = {
+static ctl_table rds_ib_sysctl_table[] = {
 	{
 		.procname       = "max_send_wr",
 		.data		= &rds_ib_sysctl_max_send_wr,
@@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = {
 		.extra2		= &rds_ib_sysctl_max_unsig_wr_max,
 	},
 	{
-		.procname       = "max_unsignaled_bytes",
-		.data		= &rds_ib_sysctl_max_unsig_bytes,
-		.maxlen         = sizeof(unsigned long),
-		.mode           = 0644,
-		.proc_handler   = proc_doulongvec_minmax,
-		.extra1		= &rds_ib_sysctl_max_unsig_bytes_min,
-		.extra2		= &rds_ib_sysctl_max_unsig_bytes_max,
-	},
-	{
 		.procname       = "max_recv_allocation",
 		.data		= &rds_ib_sysctl_max_recv_allocation,
 		.maxlen         = sizeof(unsigned long),
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
 		unregister_sysctl_table(rds_ib_sysctl_hdr);
 }
 
-int __init rds_ib_sysctl_init(void)
+int rds_ib_sysctl_init(void)
 {
 	rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
-	if (rds_ib_sysctl_hdr == NULL)
+	if (!rds_ib_sysctl_hdr)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/info.c b/net/rds/info.c
index c45c4173a44..4fdf1b6e84f 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
 	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
 
 	spin_lock(&rds_info_lock);
-	BUG_ON(rds_info_funcs[offset] != NULL);
+	BUG_ON(rds_info_funcs[offset]);
 	rds_info_funcs[offset] = func;
 	spin_unlock(&rds_info_lock);
 }
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
  */
 void rds_info_iter_unmap(struct rds_info_iterator *iter)
 {
-	if (iter->addr != NULL) {
+	if (iter->addr) {
 		kunmap_atomic(iter->addr, KM_USER0);
 		iter->addr = NULL;
 	}
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
 	unsigned long this;
 
 	while (bytes) {
-		if (iter->addr == NULL)
+		if (!iter->addr)
 			iter->addr = kmap_atomic(*iter->pages, KM_USER0);
 
 		this = min(bytes, PAGE_SIZE - iter->offset);
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 			>> PAGE_SHIFT;
 
 	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 
 call_func:
 	func = rds_info_funcs[optname - RDS_INFO_FIRST];
-	if (func == NULL) {
+	if (!func) {
 		ret = -ENOPROTOOPT;
 		goto out;
 	}
@@ -234,7 +234,7 @@ call_func:
 		ret = -EFAULT;
 
 out:
-	for (i = 0; pages != NULL && i < nr_pages; i++)
+	for (i = 0; pages && i < nr_pages; i++)
 		put_page(pages[i]);
 	kfree(pages);
 
diff --git a/net/rds/iw.c b/net/rds/iw.c
index c8f3d3525cb..5a9676fe594 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -56,7 +56,7 @@ struct list_head rds_iw_devices;
 DEFINE_SPINLOCK(iw_nodev_conns_lock);
 LIST_HEAD(iw_nodev_conns);
 
-void rds_iw_add_one(struct ib_device *device)
+static void rds_iw_add_one(struct ib_device *device)
 {
 	struct rds_iw_device *rds_iwdev;
 	struct ib_device_attr *dev_attr;
@@ -124,7 +124,7 @@ free_attr:
 	kfree(dev_attr);
 }
 
-void rds_iw_remove_one(struct ib_device *device)
+static void rds_iw_remove_one(struct ib_device *device)
 {
 	struct rds_iw_device *rds_iwdev;
 	struct rds_iw_cm_id *i_cm_id, *next;
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
 	.laddr_check		= rds_iw_laddr_check,
 	.xmit_complete		= rds_iw_xmit_complete,
 	.xmit			= rds_iw_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_iw_xmit_rdma,
 	.recv			= rds_iw_recv,
 	.conn_alloc		= rds_iw_conn_alloc,
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
 	.conn_connect		= rds_iw_conn_connect,
 	.conn_shutdown		= rds_iw_conn_shutdown,
 	.inc_copy_to_user	= rds_iw_inc_copy_to_user,
-	.inc_purge		= rds_iw_inc_purge,
 	.inc_free		= rds_iw_inc_free,
 	.cm_initiate_connect	= rds_iw_cm_initiate_connect,
 	.cm_handle_connect	= rds_iw_cm_handle_connect,
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
 	.t_prefer_loopback	= 1,
 };
 
-int __init rds_iw_init(void)
+int rds_iw_init(void)
 {
 	int ret;
 
diff --git a/net/rds/iw.h b/net/rds/iw.h
index eef2f0c2847..90151922178 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
 	struct rds_message	*s_rm;
 
 	/* We should really put these into a union: */
-	struct rds_rdma_op	*s_op;
+	struct rm_rdma_op	*s_op;
 	struct rds_iw_mapping	*s_mapping;
 	struct ib_mr		*s_mr;
 	struct ib_fast_reg_page_list *s_page_list;
@@ -268,8 +268,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
 
 /* ib.c */
 extern struct rds_transport rds_iw_transport;
-extern void rds_iw_add_one(struct ib_device *device);
-extern void rds_iw_remove_one(struct ib_device *device);
 extern struct ib_client rds_iw_client;
 
 extern unsigned int fastreg_pool_size;
@@ -284,7 +282,7 @@ void rds_iw_conn_free(void *arg);
 int rds_iw_conn_connect(struct rds_connection *conn);
 void rds_iw_conn_shutdown(struct rds_connection *conn);
 void rds_iw_state_change(struct sock *sk);
-int __init rds_iw_listen_init(void);
+int rds_iw_listen_init(void);
 void rds_iw_listen_stop(void);
 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -318,15 +316,13 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_iw_sync_mr(void *trans_private, int dir);
 void rds_iw_free_mr(void *trans_private, int invalidate);
 void rds_iw_flush_mrs(void);
-void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
 
 /* ib_recv.c */
-int __init rds_iw_recv_init(void);
+int rds_iw_recv_init(void);
 void rds_iw_recv_exit(void);
 int rds_iw_recv(struct rds_connection *conn);
 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		       gfp_t page_gfp, int prefill);
-void rds_iw_inc_purge(struct rds_incoming *inc);
 void rds_iw_inc_free(struct rds_incoming *inc);
 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
@@ -358,7 +354,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_iw_send_init_ring(struct rds_iw_connection *ic);
 void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
@@ -371,7 +367,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_iw_sysctl_init(void);
+int rds_iw_sysctl_init(void);
 void rds_iw_sysctl_exit(void);
 extern unsigned long rds_iw_sysctl_max_send_wr;
 extern unsigned long rds_iw_sysctl_max_recv_wr;
@@ -379,7 +375,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs;
 extern unsigned long rds_iw_sysctl_max_unsig_bytes;
 extern unsigned long rds_iw_sysctl_max_recv_allocation;
 extern unsigned int rds_iw_sysctl_flow_control;
-extern ctl_table rds_iw_sysctl_table[];
 
 /*
  * Helper functions for getting/setting the header and data SGEs in
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index b5dd6ac39be..712cf2d1f28 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	 * the rds_iwdev at all.
 	 */
 	rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
-	if (rds_iwdev == NULL) {
+	if (!rds_iwdev) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
 					dev->name);
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 					   ic->i_send_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 					   ic->i_recv_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
 				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}
 
 	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-	if (ic->i_sends == NULL) {
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	rds_iw_send_init_ring(ic);
 
 	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-	if (ic->i_recvs == NULL) {
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 	/* XXX too lazy? */
 	ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&ic->iw_node);
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 13dc1862d86..59509e9a9e7 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
 
 
@@ -158,7 +157,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *
 	return 0;
 }
 
-void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
+				struct rdma_cm_id *cm_id)
 {
 	struct rds_iw_cm_id *i_cm_id;
 
@@ -207,9 +207,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con
 	BUG_ON(list_empty(&ic->iw_node));
 	list_del(&ic->iw_node);
 
-	spin_lock_irq(&rds_iwdev->spinlock);
+	spin_lock(&rds_iwdev->spinlock);
 	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
-	spin_unlock_irq(&rds_iwdev->spinlock);
+	spin_unlock(&rds_iwdev->spinlock);
 	spin_unlock_irq(&iw_nodev_conns_lock);
 
 	ic->rds_iwdev = rds_iwdev;
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 3d479067d54..5e57347f49f 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
 static void rds_iw_frag_free(struct rds_page_frag *frag)
 {
 	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
+	BUG_ON(frag->f_page);
 	kmem_cache_free(rds_iw_frag_slab, frag);
 }
 
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
 	struct ib_sge *sge;
 	int ret = -ENOMEM;
 
-	if (recv->r_iwinc == NULL) {
+	if (!recv->r_iwinc) {
 		if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
 			rds_iw_stats_inc(s_iw_rx_alloc_limit);
 			goto out;
 		}
 		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
 						 kptr_gfp);
-		if (recv->r_iwinc == NULL) {
+		if (!recv->r_iwinc) {
 			atomic_dec(&rds_iw_allocation);
 			goto out;
 		}
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
 		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
 	}
 
-	if (recv->r_frag == NULL) {
+	if (!recv->r_frag) {
 		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
+		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
 		recv->r_frag->f_page = NULL;
 	}
 
-	if (ic->i_frag.f_page == NULL) {
+	if (!ic->i_frag.f_page) {
 		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
+		if (!ic->i_frag.f_page)
 			goto out;
 		ic->i_frag.f_offset = 0;
 	}
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 	return ret;
 }
 
-void rds_iw_inc_purge(struct rds_incoming *inc)
+static void rds_iw_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_iw_incoming *iwinc;
 	struct rds_page_frag *frag;
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (iwinc == NULL) {
+	if (!iwinc) {
 		iwinc = recv->r_iwinc;
 		recv->r_iwinc = NULL;
 		ic->i_iwinc = iwinc;
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
 	return ret;
 }
 
-int __init rds_iw_recv_init(void)
+int rds_iw_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
 	rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
 					sizeof(struct rds_iw_incoming),
 					0, 0, NULL);
-	if (rds_iw_incoming_slab == NULL)
+	if (!rds_iw_incoming_slab)
 		goto out;
 
 	rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
 					sizeof(struct rds_page_frag),
 					0, 0, NULL);
-	if (rds_iw_frag_slab == NULL)
+	if (!rds_iw_frag_slab)
 		kmem_cache_destroy(rds_iw_incoming_slab);
 	else
 		ret = 0;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 52182ff7519..6280ea020d4 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -36,7 +36,6 @@
 #include <linux/dmapool.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
 
 static void rds_iw_send_rdma_complete(struct rds_message *rm,
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
 }
 
 static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
-				   struct rds_rdma_op *op)
+				   struct rm_rdma_op *op)
 {
-	if (op->r_mapped) {
+	if (op->op_mapped) {
 		ib_dma_unmap_sg(ic->i_cm_id->device,
-			op->r_sg, op->r_nents,
-			op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		op->r_mapped = 0;
+			op->op_sg, op->op_nents,
+			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
 	}
 }
 
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
 
 	ib_dma_unmap_sg(ic->i_cm_id->device,
-		     rm->m_sg, rm->m_nents,
+		     rm->data.op_sg, rm->data.op_nents,
 		     DMA_TO_DEVICE);
 
-	if (rm->m_rdma_op != NULL) {
-		rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+	if (rm->rdma.op_active) {
+		rds_iw_send_unmap_rdma(ic, &rm->rdma);
 
 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		 */
 		rds_iw_send_rdma_complete(rm, wc_status);
 
-		if (rm->m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+		if (rm->rdma.op_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
 	}
 
 	/* If anyone waited for this message to get flushed out, wake
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* map the message the first time we see it */
-	if (ic->i_rm == NULL) {
+	if (!ic->i_rm) {
 		/*
 		printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
 				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
 				rm->m_inc.i_hdr.h_flags,
 				be32_to_cpu(rm->m_inc.i_hdr.h_len));
 		   */
-		if (rm->m_nents) {
-			rm->m_count = ib_dma_map_sg(dev,
-					 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-			if (rm->m_count == 0) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
 				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->m_count = 0;
+			rm->data.op_count = 0;
 		}
 
 		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		/* If it has a RDMA op, tell the peer we did it. This is
 		 * used by the peer to release use-once RDMA MRs. */
-		if (rm->m_rdma_op) {
+		if (rm->rdma.op_active) {
 			struct rds_ext_header_rdma ext_hdr;
 
-			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
 			rds_message_add_extension(&rm->m_inc.i_hdr,
 					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 		}
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->m_sg[sg];
+	scat = &rm->data.op_sg[sg];
 	sent = 0;
 	i = 0;
 
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+	if (rm->rdma.op_active && rm->rdma.op_fence)
 		send_flags = IB_SEND_FENCE;
 
 	/*
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}
 
 	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+	for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
 		unsigned int len;
 
 		send = &ic->i_sends[pos];
@@ -728,7 +729,7 @@ add_header:
 		sent += sizeof(struct rds_header);
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->m_sg[rm->m_count]) {
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
 		prev->s_rm = ic->i_rm;
 		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
 	ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
 }
 
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
 	struct rds_iw_connection *ic = conn->c_transport_data;
 	struct rds_iw_send_work *send = NULL;
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	struct rds_iw_device *rds_iwdev;
 	struct scatterlist *scat;
 	unsigned long len;
-	u64 remote_addr = op->r_remote_addr;
+	u64 remote_addr = op->op_remote_addr;
 	u32 pos, fr_pos;
 	u32 work_alloc;
 	u32 i;
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
 
 	/* map the message the first time we see it */
-	if (!op->r_mapped) {
-		op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents, (op->r_write) ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-		if (op->r_count == 0) {
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
 			rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 			ret = -ENOMEM; /* XXX ? */
 			goto out;
 		}
 
-		op->r_mapped = 1;
+		op->op_mapped = 1;
 	}
 
-	if (!op->r_write) {
+	if (!op->op_write) {
 		/* Alloc space on the send queue for the fastreg */
 		work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
 		if (work_alloc != 1) {
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->r_count, rds_iwdev->max_sge);
+	i = ceil(op->op_count, rds_iwdev->max_sge);
 
 	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	}
 
 	send = &ic->i_sends[pos];
-	if (!op->r_write) {
+	if (!op->op_write) {
 		first = prev = &ic->i_sends[fr_pos];
 	} else {
 		first = send;
 		prev = NULL;
 	}
-	scat = &op->r_sg[0];
+	scat = &op->op_sg[0];
 	sent = 0;
-	num_sge = op->r_count;
+	num_sge = op->op_count;
 
-	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
 
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		 * for local access after RDS is finished with it, using
 		 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
 		 */
-		if (op->r_write)
+		if (op->op_write)
 			send->s_wr.opcode = IB_WR_RDMA_WRITE;
 		else
 			send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
 
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
-		send->s_wr.wr.rdma.rkey = op->r_key;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
 		send->s_op = op;
 
 		if (num_sge > rds_iwdev->max_sge) {
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		if (prev)
 			prev->s_wr.next = &send->s_wr;
 
-		for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
 			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 
 			if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	}
 
 	/* if we finished the message then send completion owns it */
-	if (scat == &op->r_sg[op->r_count])
+	if (scat == &op->op_sg[op->op_count])
 		first->s_wr.send_flags = IB_SEND_SIGNALED;
 
 	if (i < work_alloc) {
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	 * adapters do not allow using the lkey for this at all.  To bypass this use a
 	 * fastreg_mr (or possibly a dma_mr)
 	 */
-	if (!op->r_write) {
+	if (!op->op_write) {
 		rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
-			op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+			op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
 		work_alloc++;
 	}
 
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 1c4428a61a0..e2e47176e72 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
 
 unsigned int rds_iw_sysctl_flow_control = 1;
 
-ctl_table rds_iw_sysctl_table[] = {
+static ctl_table rds_iw_sysctl_table[] = {
 	{
 		.procname       = "max_send_wr",
 		.data		= &rds_iw_sysctl_max_send_wr,
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
 		unregister_sysctl_table(rds_iw_sysctl_hdr);
 }
 
-int __init rds_iw_sysctl_init(void)
+int rds_iw_sysctl_init(void)
 {
 	rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
-	if (rds_iw_sysctl_hdr == NULL)
+	if (!rds_iw_sysctl_hdr)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dd987937945..c390156b426 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 			 unsigned int hdr_off, unsigned int sg,
 			 unsigned int off)
 {
+	/* Do not send cong updates to loopback */
+	if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+	}
+
 	BUG_ON(hdr_off || sg || off);
 
 	rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
-	rds_message_addref(rm); /* for the inc */
+	/* For the embedded inc. Matching put is in loop_inc_free() */
+	rds_message_addref(rm);
 
 	rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
 			  GFP_KERNEL, KM_USER0);
@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 	return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
 }
 
-static int rds_loop_xmit_cong_map(struct rds_connection *conn,
-				  struct rds_cong_map *map,
-				  unsigned long offset)
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
 {
-	BUG_ON(offset);
-	BUG_ON(map != conn->c_lcong);
-
-	rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
-
-	return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+        struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+        rds_message_put(rm);
 }
 
 /* we need to at least give the thread something to succeed */
@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	unsigned long flags;
 
 	lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
-	if (lc == NULL)
+	if (!lc)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&lc->loop_node);
@@ -169,14 +174,12 @@ void rds_loop_exit(void)
  */
 struct rds_transport rds_loop_transport = {
 	.xmit			= rds_loop_xmit,
-	.xmit_cong_map		= rds_loop_xmit_cong_map,
 	.recv			= rds_loop_recv,
 	.conn_alloc		= rds_loop_conn_alloc,
 	.conn_free		= rds_loop_conn_free,
 	.conn_connect		= rds_loop_conn_connect,
 	.conn_shutdown		= rds_loop_conn_shutdown,
 	.inc_copy_to_user	= rds_message_inc_copy_to_user,
-	.inc_purge		= rds_message_inc_purge,
-	.inc_free		= rds_message_inc_free,
+	.inc_free		= rds_loop_inc_free,
 	.t_name			= "loopback",
 };
diff --git a/net/rds/message.c b/net/rds/message.c
index 9a1d67e001b..848cff45183 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -34,9 +34,6 @@
 #include <linux/slab.h>
 
 #include "rds.h"
-#include "rdma.h"
-
-static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
 
 static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_NONE]	= 0,
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;
 
-	for (i = 0; i < rm->m_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+	for (i = 0; i < rm->data.op_nents; i++) {
+		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
 		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->m_sg[i]));
+		__free_page(sg_page(&rm->data.op_sg[i]));
 	}
-	rm->m_nents = 0;
+	rm->data.op_nents = 0;
 
-	if (rm->m_rdma_op)
-		rds_rdma_free_op(rm->m_rdma_op);
-	if (rm->m_rdma_mr)
-		rds_mr_put(rm->m_rdma_mr);
-}
+	if (rm->rdma.op_active)
+		rds_rdma_free_op(&rm->rdma);
+	if (rm->rdma.op_rdma_mr)
+		rds_mr_put(rm->rdma.op_rdma_mr);
 
-void rds_message_inc_purge(struct rds_incoming *inc)
-{
-	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-	rds_message_purge(rm);
+	if (rm->atomic.op_active)
+		rds_atomic_free_op(&rm->atomic);
+	if (rm->atomic.op_rdma_mr)
+		rds_mr_put(rm->atomic.op_rdma_mr);
 }
 
 void rds_message_put(struct rds_message *rm)
 {
 	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
-
+	if (atomic_read(&rm->m_refcount) == 0) {
+printk(KERN_CRIT "danger refcount zero on %p\n", rm);
+WARN_ON(1);
+	}
 	if (atomic_dec_and_test(&rm->m_refcount)) {
 		BUG_ON(!list_empty(&rm->m_sock_item));
 		BUG_ON(!list_empty(&rm->m_conn_item));
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_put);
 
-void rds_message_inc_free(struct rds_incoming *inc)
-{
-	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-	rds_message_put(rm);
-}
-
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
 				 __be16 dport, u64 seq)
 {
@@ -113,8 +106,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
 }
 EXPORT_SYMBOL_GPL(rds_message_populate_header);
 
-int rds_message_add_extension(struct rds_header *hdr,
-		unsigned int type, const void *data, unsigned int len)
+int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
+			      const void *data, unsigned int len)
 {
 	unsigned int ext_len = sizeof(u8) + len;
 	unsigned char *dst;
@@ -184,26 +177,6 @@ none:
 	return RDS_EXTHDR_NONE;
 }
 
-int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
-{
-	struct rds_ext_header_version ext_hdr;
-
-	ext_hdr.h_version = cpu_to_be32(version);
-	return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
-}
-
-int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
-{
-	struct rds_ext_header_version ext_hdr;
-	unsigned int pos = 0, len = sizeof(ext_hdr);
-
-	/* We assume the version extension is the only one present */
-	if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
-		return 0;
-	*version = be32_to_cpu(ext_hdr.h_version);
-	return 1;
-}
-
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
 {
 	struct rds_ext_header_rdma_dest ext_hdr;
@@ -214,41 +187,73 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
 }
 EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
 
-struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
 {
 	struct rds_message *rm;
 
-	rm = kzalloc(sizeof(struct rds_message) +
-		     (nents * sizeof(struct scatterlist)), gfp);
+	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
 	if (!rm)
 		goto out;
 
-	if (nents)
-		sg_init_table(rm->m_sg, nents);
+	rm->m_used_sgs = 0;
+	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
 	atomic_set(&rm->m_refcount, 1);
 	INIT_LIST_HEAD(&rm->m_sock_item);
 	INIT_LIST_HEAD(&rm->m_conn_item);
 	spin_lock_init(&rm->m_rs_lock);
+	init_waitqueue_head(&rm->m_flush_wait);
 
 out:
 	return rm;
 }
 
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+	struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+	struct scatterlist *sg_ret;
+
+	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+	WARN_ON(!nents);
+
+	if (rm->m_used_sgs + nents > rm->m_total_sgs)
+		return NULL;
+
+	sg_ret = &sg_first[rm->m_used_sgs];
+	sg_init_table(sg_ret, nents);
+	rm->m_used_sgs += nents;
+
+	return sg_ret;
+}
+
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
 {
 	struct rds_message *rm;
 	unsigned int i;
+	int num_sgs = ceil(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);
 
-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-	if (rm == NULL)
+	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+	if (!rm)
 		return ERR_PTR(-ENOMEM);
 
 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
-	rm->m_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+	if (!rm->data.op_sg)
+		return ERR_PTR(-ENOMEM);
 
-	for (i = 0; i < rm->m_nents; ++i) {
-		sg_set_page(&rm->m_sg[i],
+	for (i = 0; i < rm->data.op_nents; ++i) {
+		sg_set_page(&rm->data.op_sg[i],
 				virt_to_page(page_addrs[i]),
 				PAGE_SIZE, 0);
 	}
@@ -256,40 +261,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }
 
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 					       size_t total_len)
 {
 	unsigned long to_copy;
 	unsigned long iov_off;
 	unsigned long sg_off;
-	struct rds_message *rm;
 	struct iovec *iov;
 	struct scatterlist *sg;
-	int ret;
-
-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-	if (rm == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	int ret = 0;
 
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
 
 	/*
 	 * now allocate and copy in the data payload.
 	 */
-	sg = rm->m_sg;
+	sg = rm->data.op_sg;
 	iov = first_iov;
 	iov_off = 0;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
 	while (total_len) {
-		if (sg_page(sg) == NULL) {
+		if (!sg_page(sg)) {
 			ret = rds_page_remainder_alloc(sg, total_len,
 						       GFP_HIGHUSER);
 			if (ret)
 				goto out;
-			rm->m_nents++;
+			rm->data.op_nents++;
 			sg_off = 0;
 		}
 
@@ -320,14 +318,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 			sg++;
 	}
 
-	ret = 0;
 out:
-	if (ret) {
-		if (rm)
-			rds_message_put(rm);
-		rm = ERR_PTR(ret);
-	}
-	return rm;
+	return ret;
 }
 
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
@@ -348,7 +340,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 
 	iov = first_iov;
 	iov_off = 0;
-	sg = rm->m_sg;
+	sg = rm->data.op_sg;
 	vec_off = 0;
 	copied = 0;
 
@@ -394,15 +386,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
  */
 void rds_message_wait(struct rds_message *rm)
 {
-	wait_event(rds_message_flush_waitq,
+	wait_event_interruptible(rm->m_flush_wait,
 			!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
 }
 
 void rds_message_unmapped(struct rds_message *rm)
 {
 	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-	if (waitqueue_active(&rds_message_flush_waitq))
-		wake_up(&rds_message_flush_waitq);
+	wake_up_interruptible(&rm->m_flush_wait);
 }
 EXPORT_SYMBOL_GPL(rds_message_unmapped);
 
diff --git a/net/rds/page.c b/net/rds/page.c
index 595a952d4b1..d8acdebe3c7 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -40,7 +40,8 @@ struct rds_page_remainder {
 	unsigned long	r_offset;
 };
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
+				     rds_page_remainders);
 
 /*
  * returns 0 on success or -errno on failure.
@@ -57,30 +58,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
 	unsigned long ret;
 	void *addr;
 
-	if (to_user)
+	addr = kmap(page);
+	if (to_user) {
 		rds_stats_add(s_copy_to_user, bytes);
-	else
+		ret = copy_to_user(ptr, addr + offset, bytes);
+	} else {
 		rds_stats_add(s_copy_from_user, bytes);
-
-	addr = kmap_atomic(page, KM_USER0);
-	if (to_user)
-		ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
-	else
-		ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
-	kunmap_atomic(addr, KM_USER0);
-
-	if (ret) {
-		addr = kmap(page);
-		if (to_user)
-			ret = copy_to_user(ptr, addr + offset, bytes);
-		else
-			ret = copy_from_user(addr + offset, ptr, bytes);
-		kunmap(page);
-		if (ret)
-			return -EFAULT;
+		ret = copy_from_user(addr + offset, ptr, bytes);
 	}
+	kunmap(page);
 
-	return 0;
+	return ret ? -EFAULT : 0;
 }
 EXPORT_SYMBOL_GPL(rds_page_copy_user);
 
@@ -116,7 +104,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 	/* jump straight to allocation if we're trying for a huge page */
 	if (bytes >= PAGE_SIZE) {
 		page = alloc_page(gfp);
-		if (page == NULL) {
+		if (!page) {
 			ret = -ENOMEM;
 		} else {
 			sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -162,7 +150,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 		rem = &per_cpu(rds_page_remainders, get_cpu());
 		local_irq_save(flags);
 
-		if (page == NULL) {
+		if (!page) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -186,6 +174,7 @@ out:
 		 ret ? 0 : scat->length);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
 
 static int rds_page_remainder_cpu_notify(struct notifier_block *self,
 					 unsigned long action, void *hcpu)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631..8920f2a8332 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
 
-#include "rdma.h"
+#include "rds.h"
 
 /*
  * XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
 	struct rds_mr *mr;
 	struct rb_node *node;
+	unsigned long flags;
 
 	/* Release any MRs associated with this socket */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	while ((node = rb_first(&rs->rs_rdma_keys))) {
 		mr = container_of(node, struct rds_mr, r_rb_node);
 		if (mr->r_trans == rs->rs_transport)
 			mr->r_invalidate = 0;
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		rds_destroy_mr(mr);
 		rds_mr_put(mr);
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	if (rs->rs_transport && rs->rs_transport->flush_mrs)
 		rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
-	if (rs->rs_transport->get_mr == NULL) {
+	if (!rs->rs_transport->get_mr) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 
 	/* XXX clamp nr_pages to limit the size of this alloc? */
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-	if (mr == NULL) {
+	if (!mr) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
 	 * the zero page.
 	 */
-	ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
 	if (ret < 0)
 		goto out;
 
 	nents = ret;
 	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (sg == NULL) {
+	if (!sg) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -406,68 +414,153 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr && (mr->r_use_once || force)) {
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	if (mr->r_use_once || force) {
 		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 		RB_CLEAR_NODE(&mr->r_rb_node);
 		zot_me = 1;
-	} else if (mr)
-		atomic_inc(&mr->r_refcount);
+	}
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	/* May have to issue a dma_sync on this memory region.
 	 * Note we could avoid this if the operation was a RDMA READ,
 	 * but at this point we can't tell. */
-	if (mr != NULL) {
-		if (mr->r_trans->sync_mr)
-			mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
-		/* If the MR was marked as invalidate, this will
-		 * trigger an async flush. */
-		if (zot_me)
-			rds_destroy_mr(mr);
-		rds_mr_put(mr);
-	}
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me)
+		rds_destroy_mr(mr);
+	rds_mr_put(mr);
 }
 
-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
 	unsigned int i;
 
-	for (i = 0; i < ro->r_nents; i++) {
-		struct page *page = sg_page(&ro->r_sg[i]);
+	for (i = 0; i < ro->op_nents; i++) {
+		struct page *page = sg_page(&ro->op_sg[i]);
 
 		/* Mark page dirty if it was possibly modified, which
 		 * is the case for a RDMA_READ which copies from remote
 		 * to local memory */
-		if (!ro->r_write) {
-			BUG_ON(in_interrupt());
+		if (!ro->op_write) {
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 		}
 		put_page(page);
 	}
 
-	kfree(ro->r_notifier);
-	kfree(ro);
+	kfree(ro->op_notifier);
+	ro->op_notifier = NULL;
+	ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *page = sg_page(ao->op_sg);
+
+	/* Mark page dirty if it was possibly modified, which
+	 * is the case for a RDMA_READ which copies from remote
+	 * to local memory */
+	set_page_dirty(page);
+	put_page(page);
+
+	kfree(ao->op_notifier);
+	ao->op_notifier = NULL;
+	ao->op_active = 0;
 }
 
+
 /*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * Count the number of pages needed to describe an incoming iovec array.
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-					    struct rds_rdma_args *args)
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < nr_iovecs; i++) {
+		nr_pages = rds_pages_in_vec(&iov[i]);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
 {
 	struct rds_iovec vec;
-	struct rds_rdma_op *op = NULL;
+	struct rds_iovec __user *local_vec;
+	int tot_pages = 0;
 	unsigned int nr_pages;
-	unsigned int max_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	struct rds_rdma_args *args;
+	struct rm_rdma_op *op = &rm->rdma;
+	int nr_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
-	struct rds_iovec __user *local_vec;
-	struct scatterlist *sg;
-	unsigned int nr;
+	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+	int iov_size;
 	unsigned int i, j;
-	int ret;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
 
+	args = CMSG_DATA(cmsg);
 
 	if (rs->rs_bound_addr == 0) {
 		ret = -ENOTCONN; /* XXX not a great errno */
@@ -479,61 +572,59 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		goto out;
 	}
 
-	nr_pages = 0;
-	max_pages = 0;
-
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-	/* figure out the number of pages in the vector */
-	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
+	/* Check whether to allocate the iovec area */
+	iov_size = args->nr_local * sizeof(struct rds_iovec);
+	if (args->nr_local > UIO_FASTIOV) {
+		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+		if (!iovs) {
+			ret = -ENOMEM;
 			goto out;
 		}
+	}
 
-		max_pages = max(nr, max_pages);
-		nr_pages += nr;
+	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+		ret = -EFAULT;
+		goto out;
 	}
 
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
-		ret = -ENOMEM;
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
+	if (nr_pages < 0) {
+		ret = -EINVAL;
 		goto out;
 	}
 
-	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (op == NULL) {
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
-	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
-	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-	op->r_recverr = rs->rs_recverr;
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
-	sg_init_table(op->r_sg, nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+	if (!op->op_sg) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
-	if (op->r_notify || op->r_recverr) {
+	if (op->op_notify || op->op_recverr) {
 		/* We allocate an uninitialized notifier here, because
 		 * we don't want to do that in the completion handler. We
 		 * would have to use GFP_ATOMIC there, and don't want to deal
 		 * with failed allocations.
 		 */
-		op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-		if (!op->r_notifier) {
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		op->r_notifier->n_user_token = args->user_token;
-		op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
 	}
 
 	/* The cookie contains the R_Key of the remote memory region, and
@@ -543,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 	 * destination address (which is really an offset into the MR)
 	 * FIXME: We may want to move this into ib_rdma.c
 	 */
-	op->r_key = rds_rdma_cookie_key(args->cookie);
-	op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 
 	nr_bytes = 0;
 
 	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
 	       (unsigned long long)args->nr_local,
 	       (unsigned long long)args->remote_vec.addr,
-	       op->r_key);
+	       op->op_rkey);
 
 	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
 
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
 
-		rs->rs_user_addr = vec.addr;
-		rs->rs_user_bytes = vec.bytes;
-
-		/* did the user change the vec under us? */
-		if (nr > max_pages || op->r_nents + nr > nr_pages) {
-			ret = -EINVAL;
-			goto out;
-		}
 		/* If it's a WRITE operation, we want to pin the pages for reading.
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
-		ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
 		if (ret < 0)
 			goto out;
 
-		rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
-		       nr_bytes, nr, vec.bytes, vec.addr);
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			 nr_bytes, nr, iov->bytes, iov->addr);
 
-		nr_bytes += vec.bytes;
+		nr_bytes += iov->bytes;
 
 		for (j = 0; j < nr; j++) {
-			unsigned int offset = vec.addr & ~PAGE_MASK;
+			unsigned int offset = iov->addr & ~PAGE_MASK;
+			struct scatterlist *sg;
 
-			sg = &op->r_sg[op->r_nents + j];
+			sg = &op->op_sg[op->op_nents + j];
 			sg_set_page(sg, pages[j],
-					min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
+					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
 					offset);
 
-			rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
-			       sg->offset, sg->length, vec.addr, vec.bytes);
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+			       sg->offset, sg->length, iov->addr, iov->bytes);
 
-			vec.addr += sg->length;
-			vec.bytes -= sg->length;
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
 		}
 
-		op->r_nents += nr;
+		op->op_nents += nr;
 	}
 
-
 	if (nr_bytes > args->remote_vec.bytes) {
 		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
 				nr_bytes,
@@ -612,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		ret = -EINVAL;
 		goto out;
 	}
-	op->r_bytes = nr_bytes;
+	op->op_bytes = nr_bytes;
 
-	ret = 0;
 out:
+	if (iovs != iovstack)
+		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
 	kfree(pages);
-	if (ret) {
-		if (op)
-			rds_rdma_free_op(op);
-		op = ERR_PTR(ret);
-	}
-	return op;
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg)
-{
-	struct rds_rdma_op *op;
-
-	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
-	    rm->m_rdma_op != NULL)
-		return -EINVAL;
+	if (ret)
+		rds_rdma_free_op(op);
+	else
+		rds_stats_inc(s_send_rdma);
 
-	op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
-	if (IS_ERR(op))
-		return PTR_ERR(op);
-	rds_stats_inc(s_send_rdma);
-	rm->m_rdma_op = op;
-	return 0;
+	return ret;
 }
 
 /*
@@ -673,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr == NULL)
+	if (!mr)
 		err = -EINVAL;	/* invalid r_key */
 	else
 		atomic_inc(&mr->r_refcount);
@@ -681,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 
 	if (mr) {
 		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
-		rm->m_rdma_mr = mr;
+		rm->rdma.op_rdma_mr = mr;
 	}
 	return err;
 }
@@ -699,5 +757,102 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 	    rm->m_rdma_cookie != 0)
 		return -EINVAL;
 
-	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	 || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
+	}
+
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	rm->atomic.op_active = 1;
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+	if (!rm->atomic.op_sg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	return ret;
+err:
+	if (page)
+		put_page(page);
+	kfree(rm->atomic.op_notifier);
+
+	return ret;
 }
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
deleted file mode 100644
index 909c39835a5..00000000000
--- a/net/rds/rdma.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _RDS_RDMA_H
-#define _RDS_RDMA_H
-
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/scatterlist.h>
-
-#include "rds.h"
-
-struct rds_mr {
-	struct rb_node		r_rb_node;
-	atomic_t		r_refcount;
-	u32			r_key;
-
-	/* A copy of the creation flags */
-	unsigned int		r_use_once:1;
-	unsigned int		r_invalidate:1;
-	unsigned int		r_write:1;
-
-	/* This is for RDS_MR_DEAD.
-	 * It would be nice & consistent to make this part of the above
-	 * bit field here, but we need to use test_and_set_bit.
-	 */
-	unsigned long		r_state;
-	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
-	struct rds_transport	*r_trans;
-	void			*r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD		0
-
-struct rds_rdma_op {
-	u32			r_key;
-	u64			r_remote_addr;
-	unsigned int		r_write:1;
-	unsigned int		r_fence:1;
-	unsigned int		r_notify:1;
-	unsigned int		r_recverr:1;
-	unsigned int		r_mapped:1;
-	struct rds_notifier	*r_notifier;
-	unsigned int		r_bytes;
-	unsigned int		r_nents;
-	unsigned int		r_count;
-	struct scatterlist	r_sg[0];
-};
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
-	return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
-	return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
-	return cookie >> 32;
-}
-
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
-			  struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rds_rdma_op *ro);
-void rds_rdma_send_complete(struct rds_message *rm, int);
-
-extern void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
-	if (atomic_dec_and_test(&mr->r_refcount))
-		__rds_put_mr_final(mr);
-}
-
-#endif
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index e599ba2f950..4195a053982 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -36,6 +36,34 @@
 
 static struct rdma_cm_id *rds_rdma_listen_id;
 
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+		[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+	RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+	RDS_CM_EVENT_STRING(ADDR_ERROR),
+	RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+	RDS_CM_EVENT_STRING(ROUTE_ERROR),
+	RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+	RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+	RDS_CM_EVENT_STRING(CONNECT_ERROR),
+	RDS_CM_EVENT_STRING(UNREACHABLE),
+	RDS_CM_EVENT_STRING(REJECTED),
+	RDS_CM_EVENT_STRING(ESTABLISHED),
+	RDS_CM_EVENT_STRING(DISCONNECTED),
+	RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+	RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+	RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+	RDS_CM_EVENT_STRING(ADDR_CHANGE),
+	RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+	return rds_str_array(rds_cm_event_strings,
+			     ARRAY_SIZE(rds_cm_event_strings), type);
+};
+
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 			      struct rdma_cm_event *event)
 {
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 	struct rds_transport *trans;
 	int ret = 0;
 
-	rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
-		 event->event);
+	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+		 event->event, rds_cm_event_str(event->event));
 
 	if (cm_id->device->node_type == RDMA_NODE_RNIC)
 		trans = &rds_iw_transport;
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
 	default:
 		/* things like device disconnect? */
-		printk(KERN_ERR "RDS: unknown event %u!\n", event->event);
+		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+		       event->event, rds_cm_event_str(event->event));
 		break;
 	}
 
@@ -117,12 +146,13 @@ out:
 	if (conn)
 		mutex_unlock(&conn->c_cm_lock);
 
-	rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+		 rds_cm_event_str(event->event), ret);
 
 	return ret;
 }
 
-static int __init rds_rdma_listen_init(void)
+static int rds_rdma_listen_init(void)
 {
 	struct sockaddr_in sin;
 	struct rdma_cm_id *cm_id;
@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void)
 	}
 }
 
-int __init rds_rdma_init(void)
+static int rds_rdma_init(void)
 {
 	int ret;
 
@@ -204,7 +234,7 @@ out:
 }
 module_init(rds_rdma_init);
 
-void rds_rdma_exit(void)
+static void rds_rdma_exit(void)
 {
 	/* stop listening first to ensure no new connections are attempted */
 	rds_rdma_listen_stop();
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index 2f2c7d976c2..faba4e38269 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn);
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 			      struct rdma_cm_event *event);
 
-/* from rdma_transport.c */
-int rds_rdma_init(void);
-void rds_rdma_exit(void);
-
 /* from ib.c */
 extern struct rds_transport rds_ib_transport;
 int rds_ib_init(void);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c224b5bb3ba..9542449c072 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -80,6 +80,7 @@ enum {
 /* Bits for c_flags */
 #define RDS_LL_SEND_FULL	0
 #define RDS_RECONNECT_PENDING	1
+#define RDS_IN_XMIT		2
 
 struct rds_connection {
 	struct hlist_node	c_hash_node;
@@ -91,12 +92,13 @@ struct rds_connection {
 	struct rds_cong_map	*c_lcong;
 	struct rds_cong_map	*c_fcong;
 
-	struct mutex		c_send_lock;	/* protect send ring */
 	struct rds_message	*c_xmit_rm;
 	unsigned long		c_xmit_sg;
 	unsigned int		c_xmit_hdr_off;
 	unsigned int		c_xmit_data_off;
+	unsigned int		c_xmit_atomic_sent;
 	unsigned int		c_xmit_rdma_sent;
+	unsigned int		c_xmit_data_sent;
 
 	spinlock_t		c_lock;		/* protect msg queues */
 	u64			c_next_tx_seq;
@@ -116,11 +118,10 @@ struct rds_connection {
 	struct delayed_work	c_conn_w;
 	struct work_struct	c_down_w;
 	struct mutex		c_cm_lock;	/* protect conn state & cm */
+	wait_queue_head_t	c_waitq;
 
 	struct list_head	c_map_item;
 	unsigned long		c_map_queued;
-	unsigned long		c_map_offset;
-	unsigned long		c_map_bytes;
 
 	unsigned int		c_unacked_packets;
 	unsigned int		c_unacked_bytes;
@@ -206,6 +207,48 @@ struct rds_incoming {
 	rds_rdma_cookie_t	i_rdma_cookie;
 };
 
+struct rds_mr {
+	struct rb_node		r_rb_node;
+	atomic_t		r_refcount;
+	u32			r_key;
+
+	/* A copy of the creation flags */
+	unsigned int		r_use_once:1;
+	unsigned int		r_invalidate:1;
+	unsigned int		r_write:1;
+
+	/* This is for RDS_MR_DEAD.
+	 * It would be nice & consistent to make this part of the above
+	 * bit field here, but we need to use test_and_set_bit.
+	 */
+	unsigned long		r_state;
+	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
+	struct rds_transport	*r_trans;
+	void			*r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD		0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+	return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+	return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+	return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP		0
+#define RDS_ATOMIC_TYPE_FADD		1
+
 /*
  * m_sock_item and m_conn_item are on lists that are serialized under
  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
@@ -258,13 +301,71 @@ struct rds_message {
 	 *   -> rs->rs_lock
 	 */
 	spinlock_t		m_rs_lock;
+	wait_queue_head_t	m_flush_wait;
+
 	struct rds_sock		*m_rs;
-	struct rds_rdma_op	*m_rdma_op;
+
+	/* cookie to send to remote, in rds header */
 	rds_rdma_cookie_t	m_rdma_cookie;
-	struct rds_mr		*m_rdma_mr;
-	unsigned int		m_nents;
-	unsigned int		m_count;
-	struct scatterlist	m_sg[0];
+
+	unsigned int		m_used_sgs;
+	unsigned int		m_total_sgs;
+
+	void			*m_final_op;
+
+	struct {
+		struct rm_atomic_op {
+			int			op_type;
+			union {
+				struct {
+					uint64_t	compare;
+					uint64_t	swap;
+					uint64_t	compare_mask;
+					uint64_t	swap_mask;
+				} op_m_cswp;
+				struct {
+					uint64_t	add;
+					uint64_t	nocarry_mask;
+				} op_m_fadd;
+			};
+
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} atomic;
+		struct rm_rdma_op {
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_write:1;
+			unsigned int		op_fence:1;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			unsigned int		op_bytes;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} rdma;
+		struct rm_data_op {
+			unsigned int		op_active:1;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+		} data;
+	};
 };
 
 /*
@@ -305,10 +406,6 @@ struct rds_notifier {
  *                 transport is responsible for other serialization, including
  *                 rds_recv_incoming().  This is called in process context but
  *                 should try hard not to block.
- *
- * @xmit_cong_map: This asks the transport to send the local bitmap down the
- * 		   given connection.  XXX get a better story about the bitmap
- * 		   flag and header.
  */
 
 #define RDS_TRANS_IB	0
@@ -332,13 +429,11 @@ struct rds_transport {
 	void (*xmit_complete)(struct rds_connection *conn);
 	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
 		    unsigned int hdr_off, unsigned int sg, unsigned int off);
-	int (*xmit_cong_map)(struct rds_connection *conn,
-			     struct rds_cong_map *map, unsigned long offset);
-	int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
 	int (*recv)(struct rds_connection *conn);
 	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
 				size_t size);
-	void (*inc_purge)(struct rds_incoming *inc);
 	void (*inc_free)(struct rds_incoming *inc);
 
 	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -367,17 +462,11 @@ struct rds_sock {
 	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
 	 * support.
 	 */
-	struct rb_node		rs_bound_node;
+	struct hlist_node	rs_bound_node;
 	__be32			rs_bound_addr;
 	__be32			rs_conn_addr;
 	__be16			rs_bound_port;
 	__be16			rs_conn_port;
-
-	/*
-	 * This is only used to communicate the transport between bind and
-	 * initiating connections.  All other trans use is referenced through
-	 * the connection.
-	 */
 	struct rds_transport    *rs_transport;
 
 	/*
@@ -466,8 +555,8 @@ struct rds_statistics {
 	uint64_t	s_recv_ping;
 	uint64_t	s_send_queue_empty;
 	uint64_t	s_send_queue_full;
-	uint64_t	s_send_sem_contention;
-	uint64_t	s_send_sem_queue_raced;
+	uint64_t	s_send_lock_contention;
+	uint64_t	s_send_lock_queue_raced;
 	uint64_t	s_send_immediate_retry;
 	uint64_t	s_send_delayed_retry;
 	uint64_t	s_send_drop_acked;
@@ -487,6 +576,7 @@ struct rds_statistics {
 };
 
 /* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
 void rds_sock_addref(struct rds_sock *rs);
 void rds_sock_put(struct rds_sock *rs);
 void rds_wake_sk_sleep(struct rds_sock *rs);
@@ -521,15 +611,16 @@ void rds_cong_exit(void);
 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
 
 /* conn.c */
-int __init rds_conn_init(void);
+int rds_conn_init(void);
 void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
 				       struct rds_transport *trans, gfp_t gfp);
 struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
 			       struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
 void rds_conn_destroy(struct rds_connection *conn);
-void rds_conn_reset(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens,
@@ -566,7 +657,8 @@ rds_conn_connecting(struct rds_connection *conn)
 
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 					       size_t total_len);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -575,12 +667,9 @@ int rds_message_add_extension(struct rds_header *hdr,
 			      unsigned int type, const void *data, unsigned int len);
 int rds_message_next_extension(struct rds_header *hdr,
 			       unsigned int *pos, void *buf, unsigned int *buflen);
-int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
-int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 				 struct iovec *first_iov, size_t size);
-void rds_message_inc_purge(struct rds_incoming *inc);
 void rds_message_inc_free(struct rds_incoming *inc);
 void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
@@ -614,7 +703,6 @@ void rds_page_exit(void);
 /* recv.c */
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 		  __be32 saddr);
-void rds_inc_addref(struct rds_incoming *inc);
 void rds_inc_put(struct rds_incoming *inc);
 void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 		       struct rds_incoming *inc, gfp_t gfp, enum km_type km);
@@ -636,14 +724,38 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 			 is_acked_func is_acked);
-int rds_send_acked_before(struct rds_connection *conn, u64 seq);
-void rds_send_remove_from_sock(struct list_head *messages, int status);
 int rds_send_pong(struct rds_connection *conn, __be16 dport);
 struct rds_message *rds_send_get_message(struct rds_connection *,
-					 struct rds_rdma_op *);
+					 struct rm_rdma_op *);
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+	if (atomic_dec_and_test(&mr->r_refcount))
+		__rds_put_mr_final(mr);
+}
 
 /* stats.c */
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
@@ -657,14 +769,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 	put_cpu();					\
 } while (0)
 #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
-int __init rds_stats_init(void);
+int rds_stats_init(void);
 void rds_stats_exit(void);
 void rds_stats_info_copy(struct rds_info_iterator *iter,
 			 uint64_t *values, const char *const *names,
 			 size_t nr);
 
 /* sysctl.c */
-int __init rds_sysctl_init(void);
+int rds_sysctl_init(void);
 void rds_sysctl_exit(void);
 extern unsigned long rds_sysctl_sndbuf_min;
 extern unsigned long rds_sysctl_sndbuf_default;
@@ -678,9 +790,10 @@ extern unsigned long rds_sysctl_trace_flags;
 extern unsigned int  rds_sysctl_trace_level;
 
 /* threads.c */
-int __init rds_threads_init(void);
+int rds_threads_init(void);
 void rds_threads_exit(void);
 extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
 void rds_connect_worker(struct work_struct *);
 void rds_shutdown_worker(struct work_struct *);
 void rds_send_worker(struct work_struct *);
@@ -691,9 +804,10 @@ void rds_connect_complete(struct rds_connection *conn);
 int rds_trans_register(struct rds_transport *trans);
 void rds_trans_unregister(struct rds_transport *trans);
 struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
 				       unsigned int avail);
-int __init rds_trans_init(void);
+int rds_trans_init(void);
 void rds_trans_exit(void);
 
 #endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 795a00b7f2c..596689e5927 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -36,7 +36,6 @@
 #include <linux/in.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 		  __be32 saddr)
@@ -49,12 +48,11 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 }
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
-void rds_inc_addref(struct rds_incoming *inc)
+static void rds_inc_addref(struct rds_incoming *inc)
 {
 	rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
 	atomic_inc(&inc->i_refcount);
 }
-EXPORT_SYMBOL_GPL(rds_inc_addref);
 
 void rds_inc_put(struct rds_incoming *inc)
 {
@@ -210,7 +208,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 	}
 
 	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
-	if (rs == NULL) {
+	if (!rs) {
 		rds_stats_inc(s_recv_drop_no_sock);
 		goto out;
 	}
@@ -251,7 +249,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
 {
 	unsigned long flags;
 
-	if (*inc == NULL) {
+	if (!*inc) {
 		read_lock_irqsave(&rs->rs_recv_lock, flags);
 		if (!list_empty(&rs->rs_recv_queue)) {
 			*inc = list_entry(rs->rs_recv_queue.next,
@@ -297,7 +295,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
 int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 {
 	struct rds_notifier *notifier;
-	struct rds_rdma_notify cmsg;
+	struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
 	unsigned int count = 0, max_messages = ~0U;
 	unsigned long flags;
 	LIST_HEAD(copy);
@@ -334,10 +332,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
 
 		if (msghdr) {
 			cmsg.user_token = notifier->n_user_token;
-			cmsg.status  = notifier->n_status;
+			cmsg.status = notifier->n_status;
 
 			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
-					sizeof(cmsg), &cmsg);
+				       sizeof(cmsg), &cmsg);
 			if (err)
 				break;
 		}
diff --git a/net/rds/send.c b/net/rds/send.c
index 9c1c6bcaa6c..35b9c2e9caf 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -37,7 +37,6 @@
 #include <linux/list.h>
 
 #include "rds.h"
-#include "rdma.h"
 
 /* When transmitting messages in rds_send_xmit, we need to emerge from
  * time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -53,8 +52,11 @@ static int send_batch_count = 64;
 module_param(send_batch_count, int, 0444);
 MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
 
+static void rds_send_remove_from_sock(struct list_head *messages, int status);
+
 /*
- * Reset the send state. Caller must hold c_send_lock when calling here.
+ * Reset the send state.  Callers must ensure that this doesn't race with
+ * rds_send_xmit().
  */
 void rds_send_reset(struct rds_connection *conn)
 {
@@ -62,18 +64,22 @@ void rds_send_reset(struct rds_connection *conn)
 	unsigned long flags;
 
 	if (conn->c_xmit_rm) {
+		rm = conn->c_xmit_rm;
+		conn->c_xmit_rm = NULL;
 		/* Tell the user the RDMA op is no longer mapped by the
 		 * transport. This isn't entirely true (it's flushed out
 		 * independently) but as the connection is down, there's
 		 * no ongoing RDMA to/from that memory */
-		rds_message_unmapped(conn->c_xmit_rm);
-		rds_message_put(conn->c_xmit_rm);
-		conn->c_xmit_rm = NULL;
+		rds_message_unmapped(rm);
+		rds_message_put(rm);
 	}
+
 	conn->c_xmit_sg = 0;
 	conn->c_xmit_hdr_off = 0;
 	conn->c_xmit_data_off = 0;
+	conn->c_xmit_atomic_sent = 0;
 	conn->c_xmit_rdma_sent = 0;
+	conn->c_xmit_data_sent = 0;
 
 	conn->c_map_queued = 0;
 
@@ -90,6 +96,25 @@ void rds_send_reset(struct rds_connection *conn)
 	spin_unlock_irqrestore(&conn->c_lock, flags);
 }
 
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+	return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+	clear_bit(RDS_IN_XMIT, &conn->c_flags);
+	smp_mb__after_clear_bit();
+	/*
+	 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+	 * hot path and finding waiters is very rare.  We don't want to walk
+	 * the system-wide hashed waitqueue buckets in the fast path only to
+	 * almost never find waiters.
+	 */
+	if (waitqueue_active(&conn->c_waitq))
+		wake_up_all(&conn->c_waitq);
+}
+
 /*
  * We're making the concious trade-off here to only send one message
  * down the connection at a time.
@@ -109,102 +134,69 @@ int rds_send_xmit(struct rds_connection *conn)
 	struct rds_message *rm;
 	unsigned long flags;
 	unsigned int tmp;
-	unsigned int send_quota = send_batch_count;
 	struct scatterlist *sg;
 	int ret = 0;
-	int was_empty = 0;
 	LIST_HEAD(to_be_dropped);
 
+restart:
+
 	/*
 	 * sendmsg calls here after having queued its message on the send
 	 * queue.  We only have one task feeding the connection at a time.  If
 	 * another thread is already feeding the queue then we back off.  This
 	 * avoids blocking the caller and trading per-connection data between
 	 * caches per message.
-	 *
-	 * The sem holder will issue a retry if they notice that someone queued
-	 * a message after they stopped walking the send queue but before they
-	 * dropped the sem.
 	 */
-	if (!mutex_trylock(&conn->c_send_lock)) {
-		rds_stats_inc(s_send_sem_contention);
+	if (!acquire_in_xmit(conn)) {
+		rds_stats_inc(s_send_lock_contention);
 		ret = -ENOMEM;
 		goto out;
 	}
 
+	/*
+	 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+	 * we do the opposite to avoid races.
+	 */
+	if (!rds_conn_up(conn)) {
+		release_in_xmit(conn);
+		ret = 0;
+		goto out;
+	}
+
 	if (conn->c_trans->xmit_prepare)
 		conn->c_trans->xmit_prepare(conn);
 
 	/*
 	 * spin trying to push headers and data down the connection until
-	 * the connection doens't make forward progress.
+	 * the connection doesn't make forward progress.
 	 */
-	while (--send_quota) {
-		/*
-		 * See if need to send a congestion map update if we're
-		 * between sending messages.  The send_sem protects our sole
-		 * use of c_map_offset and _bytes.
-		 * Note this is used only by transports that define a special
-		 * xmit_cong_map function. For all others, we create allocate
-		 * a cong_map message and treat it just like any other send.
-		 */
-		if (conn->c_map_bytes) {
-			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
-						conn->c_map_offset);
-			if (ret <= 0)
-				break;
+	while (1) {
 
-			conn->c_map_offset += ret;
-			conn->c_map_bytes -= ret;
-			if (conn->c_map_bytes)
-				continue;
-		}
-
-		/* If we're done sending the current message, clear the
-		 * offset and S/G temporaries.
-		 */
 		rm = conn->c_xmit_rm;
-		if (rm != NULL &&
-		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-		    conn->c_xmit_sg == rm->m_nents) {
-			conn->c_xmit_rm = NULL;
-			conn->c_xmit_sg = 0;
-			conn->c_xmit_hdr_off = 0;
-			conn->c_xmit_data_off = 0;
-			conn->c_xmit_rdma_sent = 0;
-
-			/* Release the reference to the previous message. */
-			rds_message_put(rm);
-			rm = NULL;
-		}
 
-		/* If we're asked to send a cong map update, do so.
+		/*
+		 * If between sending messages, we can send a pending congestion
+		 * map update.
 		 */
-		if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
-			if (conn->c_trans->xmit_cong_map != NULL) {
-				conn->c_map_offset = 0;
-				conn->c_map_bytes = sizeof(struct rds_header) +
-					RDS_CONG_MAP_BYTES;
-				continue;
-			}
-
+		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
 			rm = rds_cong_update_alloc(conn);
 			if (IS_ERR(rm)) {
 				ret = PTR_ERR(rm);
 				break;
 			}
+			rm->data.op_active = 1;
 
 			conn->c_xmit_rm = rm;
 		}
 
 		/*
-		 * Grab the next message from the send queue, if there is one.
+		 * If not already working on one, grab the next message.
 		 *
 		 * c_xmit_rm holds a ref while we're sending this message down
 		 * the connction.  We can use this ref while holding the
 		 * send_sem.. rds_send_reset() is serialized with it.
 		 */
-		if (rm == NULL) {
+		if (!rm) {
 			unsigned int len;
 
 			spin_lock_irqsave(&conn->c_lock, flags);
@@ -224,10 +216,8 @@ int rds_send_xmit(struct rds_connection *conn)
 
 			spin_unlock_irqrestore(&conn->c_lock, flags);
 
-			if (rm == NULL) {
-				was_empty = 1;
+			if (!rm)
 				break;
-			}
 
 			/* Unfortunately, the way Infiniband deals with
 			 * RDMA to a bad MR key is by moving the entire
@@ -236,13 +226,12 @@ int rds_send_xmit(struct rds_connection *conn)
 			 * connection.
 			 * Therefore, we never retransmit messages with RDMA ops.
 			 */
-			if (rm->m_rdma_op &&
+			if (rm->rdma.op_active &&
 			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
 				spin_lock_irqsave(&conn->c_lock, flags);
 				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
 					list_move(&rm->m_conn_item, &to_be_dropped);
 				spin_unlock_irqrestore(&conn->c_lock, flags);
-				rds_message_put(rm);
 				continue;
 			}
 
@@ -263,23 +252,55 @@ int rds_send_xmit(struct rds_connection *conn)
 			conn->c_xmit_rm = rm;
 		}
 
-		/*
-		 * Try and send an rdma message.  Let's see if we can
-		 * keep this simple and require that the transport either
-		 * send the whole rdma or none of it.
-		 */
-		if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+		/* The transport either sends the whole rdma or none of it */
+		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			rm->m_final_op = &rm->rdma;
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
 			if (ret)
 				break;
 			conn->c_xmit_rdma_sent = 1;
+
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			rm->m_final_op = &rm->atomic;
+			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+			if (ret)
+				break;
+			conn->c_xmit_atomic_sent = 1;
+
 			/* The transport owns the mapped memory for now.
 			 * You can't unmap it while it's on the send queue */
 			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
 		}
 
-		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-		    conn->c_xmit_sg < rm->m_nents) {
+		/*
+		 * A number of cases require an RDS header to be sent
+		 * even if there is no data.
+		 * We permit 0-byte sends; rds-ping depends on this.
+		 * However, if there are exclusively attached silent ops,
+		 * we skip the hdr/data send, to enable silent operation.
+		 */
+		if (rm->data.op_nents == 0) {
+			int ops_present;
+			int all_ops_are_silent = 1;
+
+			ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+			if (rm->atomic.op_active && !rm->atomic.op_silent)
+				all_ops_are_silent = 0;
+			if (rm->rdma.op_active && !rm->rdma.op_silent)
+				all_ops_are_silent = 0;
+
+			if (ops_present && all_ops_are_silent
+			    && !rm->m_rdma_cookie)
+				rm->data.op_active = 0;
+		}
+
+		if (rm->data.op_active && !conn->c_xmit_data_sent) {
+			rm->m_final_op = &rm->data;
 			ret = conn->c_trans->xmit(conn, rm,
 						  conn->c_xmit_hdr_off,
 						  conn->c_xmit_sg,
@@ -295,7 +316,7 @@ int rds_send_xmit(struct rds_connection *conn)
 				ret -= tmp;
 			}
 
-			sg = &rm->m_sg[conn->c_xmit_sg];
+			sg = &rm->data.op_sg[conn->c_xmit_sg];
 			while (ret) {
 				tmp = min_t(int, ret, sg->length -
 						      conn->c_xmit_data_off);
@@ -306,49 +327,63 @@ int rds_send_xmit(struct rds_connection *conn)
 					sg++;
 					conn->c_xmit_sg++;
 					BUG_ON(ret != 0 &&
-					       conn->c_xmit_sg == rm->m_nents);
+					       conn->c_xmit_sg == rm->data.op_nents);
 				}
 			}
+
+			if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+			    (conn->c_xmit_sg == rm->data.op_nents))
+				conn->c_xmit_data_sent = 1;
 		}
-	}
 
-	/* Nuke any messages we decided not to retransmit. */
-	if (!list_empty(&to_be_dropped))
-		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+		/*
+		 * A rm will only take multiple times through this loop
+		 * if there is a data op. Thus, if the data is sent (or there was
+		 * none), then we're done with the rm.
+		 */
+		if (!rm->data.op_active || conn->c_xmit_data_sent) {
+			conn->c_xmit_rm = NULL;
+			conn->c_xmit_sg = 0;
+			conn->c_xmit_hdr_off = 0;
+			conn->c_xmit_data_off = 0;
+			conn->c_xmit_rdma_sent = 0;
+			conn->c_xmit_atomic_sent = 0;
+			conn->c_xmit_data_sent = 0;
+
+			rds_message_put(rm);
+		}
+	}
 
 	if (conn->c_trans->xmit_complete)
 		conn->c_trans->xmit_complete(conn);
 
-	/*
-	 * We might be racing with another sender who queued a message but
-	 * backed off on noticing that we held the c_send_lock.  If we check
-	 * for queued messages after dropping the sem then either we'll
-	 * see the queued message or the queuer will get the sem.  If we
-	 * notice the queued message then we trigger an immediate retry.
-	 *
-	 * We need to be careful only to do this when we stopped processing
-	 * the send queue because it was empty.  It's the only way we
-	 * stop processing the loop when the transport hasn't taken
-	 * responsibility for forward progress.
-	 */
-	mutex_unlock(&conn->c_send_lock);
+	release_in_xmit(conn);
 
-	if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
-		/* We exhausted the send quota, but there's work left to
-		 * do. Return and (re-)schedule the send worker.
-		 */
-		ret = -EAGAIN;
+	/* Nuke any messages we decided not to retransmit. */
+	if (!list_empty(&to_be_dropped)) {
+		/* irqs on here, so we can put(), unlike above */
+		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+			rds_message_put(rm);
+		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
 	}
 
-	if (ret == 0 && was_empty) {
-		/* A simple bit test would be way faster than taking the
-		 * spin lock */
-		spin_lock_irqsave(&conn->c_lock, flags);
+	/*
+	 * Other senders can queue a message after we last test the send queue
+	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
+	 * not try and send their newly queued message.  We need to check the
+	 * send queue after having cleared RDS_IN_XMIT so that their message
+	 * doesn't get stuck on the send queue.
+	 *
+	 * If the transport cannot continue (i.e ret != 0), then it must
+	 * call us when more room is available, such as from the tx
+	 * completion handler.
+	 */
+	if (ret == 0) {
+		smp_mb();
 		if (!list_empty(&conn->c_send_queue)) {
-			rds_stats_inc(s_send_sem_queue_raced);
-			ret = -EAGAIN;
+			rds_stats_inc(s_send_lock_queue_raced);
+			goto restart;
 		}
-		spin_unlock_irqrestore(&conn->c_lock, flags);
 	}
 out:
 	return ret;
@@ -376,52 +411,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
 }
 
 /*
- * Returns true if there are no messages on the send and retransmit queues
- * which have a sequence number greater than or equal to the given sequence
- * number.
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
  */
-int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
 {
-	struct rds_message *rm, *tmp;
-	int ret = 1;
+	struct rds_sock *rs = NULL;
+	struct rm_rdma_op *ro;
+	struct rds_notifier *notifier;
+	unsigned long flags;
 
-	spin_lock(&conn->c_lock);
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-			ret = 0;
-		break;
-	}
+	ro = &rm->rdma;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+	    ro->op_active && ro->op_notify && ro->op_notifier) {
+		notifier = ro->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));
 
-	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-			ret = 0;
-		break;
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ro->op_notifier = NULL;
 	}
 
-	spin_unlock(&conn->c_lock);
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
-	return ret;
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
 }
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
 
 /*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Just like above, except looks at atomic op
  */
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+void rds_atomic_send_complete(struct rds_message *rm, int status)
 {
 	struct rds_sock *rs = NULL;
-	struct rds_rdma_op *ro;
+	struct rm_atomic_op *ao;
 	struct rds_notifier *notifier;
+	unsigned long flags;
 
-	spin_lock(&rm->m_rs_lock);
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	ro = rm->m_rdma_op;
-	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro && ro->r_notify && ro->r_notifier) {
-		notifier = ro->r_notifier;
+	ao = &rm->atomic;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+	    && ao->op_active && ao->op_notify && ao->op_notifier) {
+		notifier = ao->op_notifier;
 		rs = rm->m_rs;
 		sock_hold(rds_rs_to_sk(rs));
 
@@ -430,17 +473,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
 		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
 		spin_unlock(&rs->rs_lock);
 
-		ro->r_notifier = NULL;
+		ao->op_notifier = NULL;
 	}
 
-	spin_unlock(&rm->m_rs_lock);
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 	if (rs) {
 		rds_wake_sk_sleep(rs);
 		sock_put(rds_rs_to_sk(rs));
 	}
 }
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
 
 /*
  * This is the same as rds_rdma_send_complete except we
@@ -448,15 +491,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
  * socket, socket lock) and can just move the notifier.
  */
 static inline void
-__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
 {
-	struct rds_rdma_op *ro;
+	struct rm_rdma_op *ro;
+	struct rm_atomic_op *ao;
+
+	ro = &rm->rdma;
+	if (ro->op_active && ro->op_notify && ro->op_notifier) {
+		ro->op_notifier->n_status = status;
+		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+		ro->op_notifier = NULL;
+	}
 
-	ro = rm->m_rdma_op;
-	if (ro && ro->r_notify && ro->r_notifier) {
-		ro->r_notifier->n_status = status;
-		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
-		ro->r_notifier = NULL;
+	ao = &rm->atomic;
+	if (ao->op_active && ao->op_notify && ao->op_notifier) {
+		ao->op_notifier->n_status = status;
+		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+		ao->op_notifier = NULL;
 	}
 
 	/* No need to wake the app - caller does this */
@@ -468,7 +519,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
  * So speed is not an issue here.
  */
 struct rds_message *rds_send_get_message(struct rds_connection *conn,
-					 struct rds_rdma_op *op)
+					 struct rm_rdma_op *op)
 {
 	struct rds_message *rm, *tmp, *found = NULL;
 	unsigned long flags;
@@ -476,7 +527,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	spin_lock_irqsave(&conn->c_lock, flags);
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (rm->m_rdma_op == op) {
+		if (&rm->rdma == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			goto out;
@@ -484,7 +535,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
 	}
 
 	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (rm->m_rdma_op == op) {
+		if (&rm->rdma == op) {
 			atomic_inc(&rm->m_refcount);
 			found = rm;
 			break;
@@ -506,7 +557,7 @@ EXPORT_SYMBOL_GPL(rds_send_get_message);
  * removing the messages from the 'messages' list regardless of if it found
  * the messages on the socket list or not.
  */
-void rds_send_remove_from_sock(struct list_head *messages, int status)
+static void rds_send_remove_from_sock(struct list_head *messages, int status)
 {
 	unsigned long flags;
 	struct rds_sock *rs = NULL;
@@ -544,19 +595,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
 		spin_lock(&rs->rs_lock);
 
 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-			struct rds_rdma_op *ro = rm->m_rdma_op;
+			struct rm_rdma_op *ro = &rm->rdma;
 			struct rds_notifier *notifier;
 
 			list_del_init(&rm->m_sock_item);
 			rds_send_sndbuf_remove(rs, rm);
 
-			if (ro && ro->r_notifier && (status || ro->r_notify)) {
-				notifier = ro->r_notifier;
+			if (ro->op_active && ro->op_notifier &&
+			       (ro->op_notify || (ro->op_recverr && status))) {
+				notifier = ro->op_notifier;
 				list_add_tail(&notifier->n_list,
 						&rs->rs_notify_queue);
 				if (!notifier->n_status)
 					notifier->n_status = status;
-				rm->m_rdma_op->r_notifier = NULL;
+				rm->rdma.op_notifier = NULL;
 			}
 			was_on_sock = 1;
 			rm->m_rs = NULL;
@@ -619,9 +671,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 {
 	struct rds_message *rm, *tmp;
 	struct rds_connection *conn;
-	unsigned long flags, flags2;
+	unsigned long flags;
 	LIST_HEAD(list);
-	int wake = 0;
 
 	/* get all the messages we're dropping under the rs lock */
 	spin_lock_irqsave(&rs->rs_lock, flags);
@@ -631,59 +682,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
 			continue;
 
-		wake = 1;
 		list_move(&rm->m_sock_item, &list);
 		rds_send_sndbuf_remove(rs, rm);
 		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
 	}
 
 	/* order flag updates with the rs lock */
-	if (wake)
-		smp_mb__after_clear_bit();
+	smp_mb__after_clear_bit();
 
 	spin_unlock_irqrestore(&rs->rs_lock, flags);
 
-	conn = NULL;
+	if (list_empty(&list))
+		return;
 
-	/* now remove the messages from the conn list as needed */
+	/* Remove the messages from the conn */
 	list_for_each_entry(rm, &list, m_sock_item) {
-		/* We do this here rather than in the loop above, so that
-		 * we don't have to nest m_rs_lock under rs->rs_lock */
-		spin_lock_irqsave(&rm->m_rs_lock, flags2);
-		/* If this is a RDMA operation, notify the app. */
-		spin_lock(&rs->rs_lock);
-		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
-		spin_unlock(&rs->rs_lock);
-		rm->m_rs = NULL;
-		spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
 
+		conn = rm->m_inc.i_conn;
+
+		spin_lock_irqsave(&conn->c_lock, flags);
 		/*
-		 * If we see this flag cleared then we're *sure* that someone
-		 * else beat us to removing it from the conn.  If we race
-		 * with their flag update we'll get the lock and then really
-		 * see that the flag has been cleared.
+		 * Maybe someone else beat us to removing rm from the conn.
+		 * If we race with their flag update we'll get the lock and
+		 * then really see that the flag has been cleared.
 		 */
-		if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+			spin_unlock_irqrestore(&conn->c_lock, flags);
 			continue;
-
-		if (conn != rm->m_inc.i_conn) {
-			if (conn)
-				spin_unlock_irqrestore(&conn->c_lock, flags);
-			conn = rm->m_inc.i_conn;
-			spin_lock_irqsave(&conn->c_lock, flags);
 		}
+		list_del_init(&rm->m_conn_item);
+		spin_unlock_irqrestore(&conn->c_lock, flags);
 
-		if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
-			list_del_init(&rm->m_conn_item);
-			rds_message_put(rm);
-		}
-	}
+		/*
+		 * Couldn't grab m_rs_lock in top loop (lock ordering),
+		 * but we can now.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
 
-	if (conn)
-		spin_unlock_irqrestore(&conn->c_lock, flags);
+		spin_lock(&rs->rs_lock);
+		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		spin_unlock(&rs->rs_lock);
 
-	if (wake)
-		rds_wake_sk_sleep(rs);
+		rm->m_rs = NULL;
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+		rds_message_put(rm);
+	}
+
+	rds_wake_sk_sleep(rs);
 
 	while (!list_empty(&list)) {
 		rm = list_entry(list.next, struct rds_message, m_sock_item);
@@ -763,6 +809,63 @@ out:
 	return *queued;
 }
 
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+	struct cmsghdr *cmsg;
+	int size = 0;
+	int cmsg_groups = 0;
+	int retval;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			cmsg_groups |= 1;
+			retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+			if (retval < 0)
+				return retval;
+			size += retval;
+
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+		case RDS_CMSG_RDMA_MAP:
+			cmsg_groups |= 2;
+			/* these are valid but do no add any size */
+			break;
+
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			cmsg_groups |= 1;
+			size += sizeof(struct scatterlist);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+	}
+
+	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+	if (cmsg_groups == 3)
+		return -EINVAL;
+
+	return size;
+}
+
 static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			 struct msghdr *msg, int *allocated_mr)
 {
@@ -777,7 +880,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			continue;
 
 		/* As a side effect, RDMA_DEST and RDMA_MAP will set
-		 * rm->m_rdma_cookie and rm->m_rdma_mr.
+		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
 		 */
 		switch (cmsg->cmsg_type) {
 		case RDS_CMSG_RDMA_ARGS:
@@ -793,6 +896,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			if (!ret)
 				*allocated_mr = 1;
 			break;
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			ret = rds_cmsg_atomic(rs, rm, cmsg);
+			break;
 
 		default:
 			return -EINVAL;
@@ -850,13 +959,30 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
-	if (IS_ERR(rm)) {
-		ret = PTR_ERR(rm);
-		rm = NULL;
+	/* size of rm including all sgs */
+	ret = rds_rm_size(msg, payload_len);
+	if (ret < 0)
+		goto out;
+
+	rm = rds_message_alloc(ret, GFP_KERNEL);
+	if (!rm) {
+		ret = -ENOMEM;
 		goto out;
 	}
 
+	/* Attach data to the rm */
+	if (payload_len) {
+		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		if (!rm->data.op_sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+		if (ret)
+			goto out;
+	}
+	rm->data.op_active = 1;
+
 	rm->m_daddr = daddr;
 
 	/* rds_conn_create has a spinlock that runs with IRQ off.
@@ -879,22 +1005,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	if (ret)
 		goto out;
 
-	if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
-	    conn->c_trans->xmit_rdma == NULL) {
+	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-				rm->m_rdma_op, conn->c_trans->xmit_rdma);
+			       &rm->rdma, conn->c_trans->xmit_rdma);
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
 
-	/* If the connection is down, trigger a connect. We may
-	 * have scheduled a delayed reconnect however - in this case
-	 * we should not interfere.
-	 */
-	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+			       &rm->atomic, conn->c_trans->xmit_atomic);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rds_conn_connect_if_down(conn);
 
 	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
 	if (ret) {
@@ -938,7 +1065,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 	rds_stats_inc(s_send_queued);
 
 	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-		rds_send_worker(&conn->c_send_w.work);
+		rds_send_xmit(conn);
 
 	rds_message_put(rm);
 	return payload_len;
@@ -966,20 +1093,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	int ret = 0;
 
 	rm = rds_message_alloc(0, GFP_ATOMIC);
-	if (rm == NULL) {
+	if (!rm) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
 	rm->m_daddr = conn->c_faddr;
+	rm->data.op_active = 1;
 
-	/* If the connection is down, trigger a connect. We may
-	 * have scheduled a delayed reconnect however - in this case
-	 * we should not interfere.
-	 */
-	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+	rds_conn_connect_if_down(conn);
 
 	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
 	if (ret)
@@ -999,7 +1121,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	rds_stats_inc(s_send_queued);
 	rds_stats_inc(s_send_pong);
 
-	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_xmit(conn);
+
 	rds_message_put(rm);
 	return 0;
 
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 7598eb07cfb..10c759ccac0 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
 	"recv_ping",
 	"send_queue_empty",
 	"send_queue_full",
-	"send_sem_contention",
-	"send_sem_queue_raced",
+	"send_lock_contention",
+	"send_lock_queue_raced",
 	"send_immediate_retry",
 	"send_delayed_retry",
 	"send_drop_acked",
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
 	rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
 }
 
-int __init rds_stats_init(void)
+int rds_stats_init(void)
 {
 	rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
 	return 0;
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 7829a20325d..25ad0c77a26 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
 		unregister_sysctl_table(rds_sysctl_reg_table);
 }
 
-int __init rds_sysctl_init(void)
+int rds_sysctl_init(void)
 {
 	rds_sysctl_reconnect_min = msecs_to_jiffies(1);
 	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
 
 	rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
-	if (rds_sysctl_reg_table == NULL)
+	if (!rds_sysctl_reg_table)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index babf4577ff7..08a8c6cf2d1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -41,7 +41,7 @@
 /* only for info exporting */
 static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
 static LIST_HEAD(rds_tcp_tc_list);
-unsigned int rds_tcp_tc_count;
+static unsigned int rds_tcp_tc_count;
 
 /* Track rds_tcp_connection structs so they can be cleaned up */
 static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	struct rds_tcp_connection *tc;
 
 	tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
-	if (tc == NULL)
+	if (!tc)
 		return -ENOMEM;
 
 	tc->t_sock = NULL;
@@ -243,7 +243,7 @@ static void rds_tcp_destroy_conns(void)
 	}
 }
 
-void rds_tcp_exit(void)
+static void rds_tcp_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 	rds_tcp_listen_stop();
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
 	.laddr_check		= rds_tcp_laddr_check,
 	.xmit_prepare		= rds_tcp_xmit_prepare,
 	.xmit_complete		= rds_tcp_xmit_complete,
-	.xmit_cong_map		= rds_tcp_xmit_cong_map,
 	.xmit			= rds_tcp_xmit,
 	.recv			= rds_tcp_recv,
 	.conn_alloc		= rds_tcp_conn_alloc,
@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = {
 	.conn_connect		= rds_tcp_conn_connect,
 	.conn_shutdown		= rds_tcp_conn_shutdown,
 	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
-	.inc_purge		= rds_tcp_inc_purge,
 	.inc_free		= rds_tcp_inc_free,
 	.stats_info_copy	= rds_tcp_stats_info_copy,
 	.exit			= rds_tcp_exit,
@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = {
 	.t_prefer_loopback	= 1,
 };
 
-int __init rds_tcp_init(void)
+static int rds_tcp_init(void)
 {
 	int ret;
 
 	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
 					      sizeof(struct rds_tcp_connection),
 					      0, 0, NULL);
-	if (rds_tcp_conn_slab == NULL) {
+	if (!rds_tcp_conn_slab) {
 		ret = -ENOMEM;
 		goto out;
 	}
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 844fa6b9cf5..9cf2927d002 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -43,8 +43,6 @@ struct rds_tcp_statistics {
 };
 
 /* tcp.c */
-int __init rds_tcp_init(void);
-void rds_tcp_exit(void);
 void rds_tcp_tune(struct socket *sock);
 void rds_tcp_nonagle(struct socket *sock);
 void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
@@ -61,16 +59,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
 void rds_tcp_state_change(struct sock *sk);
 
 /* tcp_listen.c */
-int __init rds_tcp_listen_init(void);
+int rds_tcp_listen_init(void);
 void rds_tcp_listen_stop(void);
 void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
 
 /* tcp_recv.c */
-int __init rds_tcp_recv_init(void);
+int rds_tcp_recv_init(void);
 void rds_tcp_recv_exit(void);
 void rds_tcp_data_ready(struct sock *sk, int bytes);
 int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_purge(struct rds_incoming *inc);
 void rds_tcp_inc_free(struct rds_incoming *inc);
 int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			     size_t size);
@@ -81,8 +78,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 	         unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_tcp_write_space(struct sock *sk);
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-			  struct rds_cong_map *map, unsigned long offset);
 
 /* tcp_stats.c */
 DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index c397524c039..af95c8e058f 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -43,9 +43,9 @@ void rds_tcp_state_change(struct sock *sk)
 	struct rds_connection *conn;
 	struct rds_tcp_connection *tc;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) {
+	if (!conn) {
 		state_change = sk->sk_state_change;
 		goto out;
 	}
@@ -68,7 +68,7 @@ void rds_tcp_state_change(struct sock *sk)
 			break;
 	}
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	state_change(sk);
 }
 
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 975183fe695..8b5cc4aa886 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -114,9 +114,9 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
 
 	rdsdebug("listen data ready sk %p\n", sk);
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	ready = sk->sk_user_data;
-	if (ready == NULL) { /* check for teardown race */
+	if (!ready) { /* check for teardown race */
 		ready = sk->sk_data_ready;
 		goto out;
 	}
@@ -131,11 +131,11 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
 		queue_work(rds_wq, &rds_tcp_listen_work);
 
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	ready(sk, bytes);
 }
 
-int __init rds_tcp_listen_init(void)
+int rds_tcp_listen_init(void)
 {
 	struct sockaddr_in sin;
 	struct socket *sock = NULL;
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
 	struct socket *sock = rds_tcp_listen_sock;
 	struct sock *sk;
 
-	if (sock == NULL)
+	if (!sock)
 		return;
 
 	sk = sock->sk;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 1aba6878fa5..78205e25500 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -39,7 +39,7 @@
 
 static struct kmem_cache *rds_tcp_incoming_slab;
 
-void rds_tcp_inc_purge(struct rds_incoming *inc)
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_tcp_incoming *tinc;
 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	 * processing.
 	 */
 	while (left) {
-		if (tinc == NULL) {
+		if (!tinc) {
 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
 					        arg->gfp);
-			if (tinc == NULL) {
+			if (!tinc) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 
 		if (left && tc->t_tinc_data_rem) {
 			clone = skb_clone(skb, arg->gfp);
-			if (clone == NULL) {
+			if (!clone) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
@@ -272,7 +272,8 @@ out:
 }
 
 /* the caller has to hold the sock lock */
-int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km)
+static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp,
+			     enum km_type km)
 {
 	struct rds_tcp_connection *tc = conn->c_transport_data;
 	struct socket *sock = tc->t_sock;
@@ -324,9 +325,9 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
 
 	rdsdebug("data ready sk %p bytes %d\n", sk, bytes);
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) { /* check for teardown race */
+	if (!conn) { /* check for teardown race */
 		ready = sk->sk_data_ready;
 		goto out;
 	}
@@ -338,16 +339,16 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
 	if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM)
 		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 	ready(sk, bytes);
 }
 
-int __init rds_tcp_recv_init(void)
+int rds_tcp_recv_init(void)
 {
 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
 					sizeof(struct rds_tcp_incoming),
 					0, 0, NULL);
-	if (rds_tcp_incoming_slab == NULL)
+	if (!rds_tcp_incoming_slab)
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index a28b895ff0d..1b4fd68f0c7 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
 }
 
 /* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
 {
 	struct kvec vec = {
                 .iov_base = data,
@@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
 }
 
 /* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-			  struct rds_cong_map *map, unsigned long offset)
-{
-	static struct rds_header rds_tcp_map_header = {
-		.h_flags = RDS_FLAG_CONG_BITMAP,
-	};
-	struct rds_tcp_connection *tc = conn->c_transport_data;
-	unsigned long i;
-	int ret;
-	int copied = 0;
-
-	/* Some problem claims cpu_to_be32(constant) isn't a constant. */
-	rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
-
-	if (offset < sizeof(struct rds_header)) {
-		ret = rds_tcp_sendmsg(tc->t_sock,
-				      (void *)&rds_tcp_map_header + offset,
-				      sizeof(struct rds_header) - offset);
-		if (ret <= 0)
-			return ret;
-		offset += ret;
-		copied = ret;
-		if (offset < sizeof(struct rds_header))
-			return ret;
-	}
-
-	offset -= sizeof(struct rds_header);
-	i = offset / PAGE_SIZE;
-	offset = offset % PAGE_SIZE;
-	BUG_ON(i >= RDS_CONG_MAP_PAGES);
-
-	do {
-		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-					virt_to_page(map->m_page_addrs[i]),
-					offset, PAGE_SIZE - offset,
-					MSG_DONTWAIT);
-		if (ret <= 0)
-			break;
-		copied += ret;
-		offset += ret;
-		if (offset == PAGE_SIZE) {
-			offset = 0;
-			i++;
-		}
-	} while (i < RDS_CONG_MAP_PAGES);
-
-        return copied ? copied : ret;
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 	         unsigned int hdr_off, unsigned int sg, unsigned int off)
 {
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 			goto out;
 	}
 
-	while (sg < rm->m_nents) {
+	while (sg < rm->data.op_nents) {
 		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-						sg_page(&rm->m_sg[sg]),
-						rm->m_sg[sg].offset + off,
-						rm->m_sg[sg].length - off,
+						sg_page(&rm->data.op_sg[sg]),
+						rm->data.op_sg[sg].offset + off,
+						rm->data.op_sg[sg].length - off,
 						MSG_DONTWAIT|MSG_NOSIGNAL);
-		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
-			 rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
 			 ret);
 		if (ret <= 0)
 			break;
 
 		off += ret;
 		done += ret;
-		if (off == rm->m_sg[sg].length) {
+		if (off == rm->data.op_sg[sg].length) {
 			off = 0;
 			sg++;
 		}
@@ -224,9 +174,9 @@ void rds_tcp_write_space(struct sock *sk)
 	struct rds_connection *conn;
 	struct rds_tcp_connection *tc;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) {
+	if (!conn) {
 		write_space = sk->sk_write_space;
 		goto out;
 	}
@@ -244,7 +194,7 @@ void rds_tcp_write_space(struct sock *sk)
 		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 
 	/*
 	 * write_space is only called when data leaves tcp's send queue if
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 786c20eaaf5..0fd90f8c5f5 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -61,7 +61,7 @@
  *
  * Transition to state DISCONNECTING/DOWN:
  *  -	Inside the shutdown worker; synchronizes with xmit path
- *	through c_send_lock, and with connection management callbacks
+ *	through RDS_IN_XMIT, and with connection management callbacks
  *	via c_cm_lock.
  *
  *	For receive callbacks, we rely on the underlying transport
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
  * We should *always* start with a random backoff; otherwise a broken connection
  * will always take several iterations to be re-established.
  */
-static void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_connection *conn)
 {
 	unsigned long rand;
 
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
 	}
 }
 
-void rds_shutdown_worker(struct work_struct *work)
-{
-	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
-	/* shut it down unless it's down already */
-	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
-		/*
-		 * Quiesce the connection mgmt handlers before we start tearing
-		 * things down. We don't hold the mutex for the entire
-		 * duration of the shutdown operation, else we may be
-		 * deadlocking with the CM handler. Instead, the CM event
-		 * handler is supposed to check for state DISCONNECTING
-		 */
-		mutex_lock(&conn->c_cm_lock);
-		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
-		    !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
-			rds_conn_error(conn, "shutdown called in state %d\n",
-					atomic_read(&conn->c_state));
-			mutex_unlock(&conn->c_cm_lock);
-			return;
-		}
-		mutex_unlock(&conn->c_cm_lock);
-
-		mutex_lock(&conn->c_send_lock);
-		conn->c_trans->conn_shutdown(conn);
-		rds_conn_reset(conn);
-		mutex_unlock(&conn->c_send_lock);
-
-		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
-			/* This can happen - eg when we're in the middle of tearing
-			 * down the connection, and someone unloads the rds module.
-			 * Quite reproduceable with loopback connections.
-			 * Mostly harmless.
-			 */
-			rds_conn_error(conn,
-				"%s: failed to transition to state DOWN, "
-				"current state is %d\n",
-				__func__,
-				atomic_read(&conn->c_state));
-			return;
-		}
-	}
-
-	/* Then reconnect if it's still live.
-	 * The passive side of an IB loopback connection is never added
-	 * to the conn hash, so we never trigger a reconnect on this
-	 * conn - the reconnect is always triggered by the active peer. */
-	cancel_delayed_work(&conn->c_conn_w);
-	if (!hlist_unhashed(&conn->c_hash_node))
-		rds_queue_reconnect(conn);
-}
-
 void rds_send_worker(struct work_struct *work)
 {
 	struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
 	}
 }
 
+void rds_shutdown_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+	rds_conn_shutdown(conn);
+}
+
 void rds_threads_exit(void)
 {
 	destroy_workqueue(rds_wq);
 }
 
-int __init rds_threads_init(void)
+int rds_threads_init(void)
 {
-	rds_wq = create_workqueue("krdsd");
-	if (rds_wq == NULL)
+	rds_wq = create_singlethread_workqueue("krdsd");
+	if (!rds_wq)
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7e106790135..7f2ac4fec36 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
 }
 EXPORT_SYMBOL_GPL(rds_trans_unregister);
 
+void rds_trans_put(struct rds_transport *trans)
+{
+	if (trans && trans->t_owner)
+		module_put(trans->t_owner);
+}
+
 struct rds_transport *rds_trans_get_preferred(__be32 addr)
 {
 	struct rds_transport *ret = NULL;
-	int i;
+	struct rds_transport *trans;
+	unsigned int i;
 
 	if (IN_LOOPBACK(ntohl(addr)))
 		return &rds_loop_transport;
 
 	down_read(&rds_trans_sem);
-	for (i = 0; i < RDS_TRANS_COUNT; i++)
-	{
-		if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
-			ret = transports[i];
+	for (i = 0; i < RDS_TRANS_COUNT; i++) {
+		trans = transports[i];
+
+		if (trans && (trans->laddr_check(addr) == 0) &&
+		    (!trans->t_owner || try_module_get(trans->t_owner))) {
+			ret = trans;
 			break;
 		}
 	}
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644
index 00000000000..e6b5190dadd
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,80 @@
+#ifndef _LINUX_XLIST_H
+#define _LINUX_XLIST_H
+
+#include <linux/stddef.h>
+#include <linux/poison.h>
+#include <linux/prefetch.h>
+#include <asm/system.h>
+
+struct xlist_head {
+	struct xlist_head *next;
+};
+
+static inline void INIT_XLIST_HEAD(struct xlist_head *list)
+{
+	list->next = NULL;
+}
+
+static inline int xlist_empty(struct xlist_head *head)
+{
+	return head->next == NULL;
+}
+
+static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
+			     struct xlist_head *head)
+{
+	struct xlist_head *cur;
+	struct xlist_head *check;
+
+	while (1) {
+		cur = head->next;
+		tail->next = cur;
+		check = cmpxchg(&head->next, cur, new);
+		if (check == cur)
+			break;
+	}
+}
+
+static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
+{
+	struct xlist_head *cur;
+	struct xlist_head *check;
+	struct xlist_head *next;
+
+	while (1) {
+		cur = head->next;
+		if (!cur)
+			goto out;
+
+		next = cur->next;
+		check = cmpxchg(&head->next, cur, next);
+		if (check == cur)
+			goto out;
+	}
+out:
+	return cur;
+}
+
+static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
+{
+	struct xlist_head *cur;
+
+	cur = head->next;
+	if (!cur)
+		return NULL;
+
+	head->next = cur->next;
+	return cur;
+}
+
+static inline void xlist_splice(struct xlist_head *list,
+				struct xlist_head *head)
+{
+	struct xlist_head *cur;
+
+	WARN_ON(head->next);
+	cur = xchg(&list->next, NULL);
+	head->next = cur;
+}
+
+#endif
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 51875a0c5d4..04f599089e6 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1241,6 +1241,7 @@ static const struct file_operations rfkill_fops = {
 	.unlocked_ioctl	= rfkill_fop_ioctl,
 	.compat_ioctl	= rfkill_fop_ioctl,
 #endif
+	.llseek		= no_llseek,
 };
 
 static struct miscdevice rfkill_miscdev = {
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
index 3713d7ecab9..1bca6d49ec9 100644
--- a/net/rfkill/input.c
+++ b/net/rfkill/input.c
@@ -142,7 +142,7 @@ static unsigned long rfkill_last_scheduled;
 static unsigned long rfkill_ratelimit(const unsigned long last)
 {
 	const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY);
-	return (time_after(jiffies, last + delay)) ? 0 : delay;
+	return time_after(jiffies, last + delay) ? 0 : delay;
 }
 
 static void rfkill_schedule_ratelimited(void)
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8e45e76a95f..d952e7eac18 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -679,7 +679,7 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
 		return -EINVAL;
 
-	if (addr->srose_ndigis > ROSE_MAX_DIGIS)
+	if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
 		return -EINVAL;
 
 	if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) {
@@ -739,7 +739,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le
 	if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1)
 		return -EINVAL;
 
-	if (addr->srose_ndigis > ROSE_MAX_DIGIS)
+	if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
 		return -EINVAL;
 
 	/* Source + Destination digis should not exceed ROSE_MAX_DIGIS */
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index a750a28e022..fa5f5641a2c 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -114,7 +114,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh)
 	if (ax25s)
 		ax25_cb_put(ax25s);
 
-	return (neigh->ax25 != NULL);
+	return neigh->ax25 != NULL;
 }
 
 /*
@@ -137,7 +137,7 @@ static int rose_link_up(struct rose_neigh *neigh)
 	if (ax25s)
 		ax25_cb_put(ax25s);
 
-	return (neigh->ax25 != NULL);
+	return neigh->ax25 != NULL;
 }
 
 /*
diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index cbc244a128b..b4fdaac233f 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -109,7 +109,9 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route,
 		init_timer(&rose_neigh->t0timer);
 
 		if (rose_route->ndigis != 0) {
-			if ((rose_neigh->digipeat = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) {
+			rose_neigh->digipeat =
+				kmalloc(sizeof(ax25_digi), GFP_ATOMIC);
+			if (rose_neigh->digipeat == NULL) {
 				kfree(rose_neigh);
 				res = -ENOMEM;
 				goto out;
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index 2714da167fb..b6ffe4e1b84 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -245,6 +245,9 @@ static void rxrpc_resend_timer(struct rxrpc_call *call)
 	_enter("%d,%d,%d",
 	       call->acks_tail, call->acks_unacked, call->acks_head);
 
+	if (call->state >= RXRPC_CALL_COMPLETE)
+		return;
+
 	resend = 0;
 	resend_at = 0;
 
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index 909d092de9f..bf656c230ba 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -786,6 +786,7 @@ static void rxrpc_call_life_expired(unsigned long _call)
 
 /*
  * handle resend timer expiry
+ * - may not take call->state_lock as this can deadlock against del_timer_sync()
  */
 static void rxrpc_resend_time_expired(unsigned long _call)
 {
@@ -796,12 +797,9 @@ static void rxrpc_resend_time_expired(unsigned long _call)
 	if (call->state >= RXRPC_CALL_COMPLETE)
 		return;
 
-	read_lock_bh(&call->state_lock);
 	clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
-	if (call->state < RXRPC_CALL_COMPLETE &&
-	    !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+	if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
 		rxrpc_queue_call(call);
-	read_unlock_bh(&call->state_lock);
 }
 
 /*
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7043b294bb6..8e22bd345e7 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -597,12 +597,6 @@ extern unsigned rxrpc_debug;
 #define dbgprintk(FMT,...) \
 	printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
 
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
-
 #define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
@@ -655,11 +649,11 @@ do {							\
 } while (0)
 
 #else
-#define _enter(FMT,...)	_dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...)	_dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...)	_dbprintk("    "FMT ,##__VA_ARGS__)
-#define _proto(FMT,...)	_dbprintk("### "FMT ,##__VA_ARGS__)
-#define _net(FMT,...)	_dbprintk("@@@ "FMT ,##__VA_ARGS__)
+#define _enter(FMT,...)	no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	no_printk("    "FMT ,##__VA_ARGS__)
+#define _proto(FMT,...)	no_printk("### "FMT ,##__VA_ARGS__)
+#define _net(FMT,...)	no_printk("@@@ "FMT ,##__VA_ARGS__)
 #endif
 
 /*
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
index f0f85b0123f..9f1729bd60d 100644
--- a/net/rxrpc/ar-peer.c
+++ b/net/rxrpc/ar-peer.c
@@ -64,8 +64,8 @@ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer)
 		return;
 	}
 
-	peer->if_mtu = dst_mtu(&rt->u.dst);
-	dst_release(&rt->u.dst);
+	peer->if_mtu = dst_mtu(&rt->dst);
+	dst_release(&rt->dst);
 
 	_leave(" [if_mtu %u]", peer->if_mtu);
 }
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 2f691fb180d..a36270a994d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -518,6 +518,16 @@ config NET_ACT_SKBEDIT
 	  To compile this code as a module, choose M here: the
 	  module will be called act_skbedit.
 
+config NET_ACT_CSUM
+        tristate "Checksum Updating"
+        depends on NET_CLS_ACT && INET
+        ---help---
+	  Say Y here to update some common checksum after some direct
+	  packet alterations.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_csum.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index f14e71bfa58..960f5dba630 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_NET_ACT_NAT)	+= act_nat.o
 obj-$(CONFIG_NET_ACT_PEDIT)	+= act_pedit.o
 obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
 obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
+obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 972378f47f3..23b25f89e7e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -26,6 +26,11 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
+static void tcf_common_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_common, tcfc_rcu));
+}
+
 void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 {
 	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
@@ -38,7 +43,11 @@ void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
 			write_unlock_bh(hinfo->lock);
 			gen_kill_estimator(&p->tcfc_bstats,
 					   &p->tcfc_rate_est);
-			kfree(p);
+			/*
+			 * gen_estimator est_timer() might access p->tcfc_lock
+			 * or bstats, wait a RCU grace period before freeing p
+			 */
+			call_rcu(&p->tcfc_rcu, tcf_common_free_rcu);
 			return;
 		}
 	}
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
new file mode 100644
index 00000000000..67dc7ce9b63
--- /dev/null
+++ b/net/sched/act_csum.c
@@ -0,0 +1,595 @@
+/*
+ * Checksum updating actions
+ *
+ * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include <linux/netlink.h>
+#include <net/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <linux/skbuff.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/igmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/ip6_checksum.h>
+
+#include <net/act_api.h>
+
+#include <linux/tc_act/tc_csum.h>
+#include <net/tc_act/tc_csum.h>
+
+#define CSUM_TAB_MASK 15
+static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1];
+static u32 csum_idx_gen;
+static DEFINE_RWLOCK(csum_lock);
+
+static struct tcf_hashinfo csum_hash_info = {
+	.htab	= tcf_csum_ht,
+	.hmask	= CSUM_TAB_MASK,
+	.lock	= &csum_lock,
+};
+
+static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
+	[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
+};
+
+static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct nlattr *tb[TCA_CSUM_MAX + 1];
+	struct tc_csum *parm;
+	struct tcf_common *pc;
+	struct tcf_csum *p;
+	int ret = 0, err;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CSUM_PARMS] == NULL)
+		return -EINVAL;
+	parm = nla_data(tb[TCA_CSUM_PARMS]);
+
+	pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
+				     &csum_idx_gen, &csum_hash_info);
+		if (IS_ERR(pc))
+			return PTR_ERR(pc);
+		p = to_tcf_csum(pc);
+		ret = ACT_P_CREATED;
+	} else {
+		p = to_tcf_csum(pc);
+		if (!ovr) {
+			tcf_hash_release(pc, bind, &csum_hash_info);
+			return -EEXIST;
+		}
+	}
+
+	spin_lock_bh(&p->tcf_lock);
+	p->tcf_action = parm->action;
+	p->update_flags = parm->update_flags;
+	spin_unlock_bh(&p->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &csum_hash_info);
+
+	return ret;
+}
+
+static int tcf_csum_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_csum *p = a->priv;
+	return tcf_hash_release(&p->common, bind, &csum_hash_info);
+}
+
+/**
+ * tcf_csum_skb_nextlayer - Get next layer pointer
+ * @skb: sk_buff to use
+ * @ihl: previous summed headers length
+ * @ipl: complete packet length
+ * @jhl: next header length
+ *
+ * Check the expected next layer availability in the specified sk_buff.
+ * Return the next layer pointer if pass, NULL otherwise.
+ */
+static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
+				    unsigned int ihl, unsigned int ipl,
+				    unsigned int jhl)
+{
+	int ntkoff = skb_network_offset(skb);
+	int hl = ihl + jhl;
+
+	if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
+	    (skb_cloned(skb) &&
+	     !skb_clone_writable(skb, hl + ntkoff) &&
+	     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+		return NULL;
+	else
+		return (void *)(skb_network_header(skb) + ihl);
+}
+
+static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
+			      unsigned int ihl, unsigned int ipl)
+{
+	struct icmphdr *icmph;
+
+	icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
+	if (icmph == NULL)
+		return 0;
+
+	icmph->checksum = 0;
+	skb->csum = csum_partial(icmph, ipl - ihl, 0);
+	icmph->checksum = csum_fold(skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
+			      unsigned int ihl, unsigned int ipl)
+{
+	struct igmphdr *igmph;
+
+	igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
+	if (igmph == NULL)
+		return 0;
+
+	igmph->csum = 0;
+	skb->csum = csum_partial(igmph, ipl - ihl, 0);
+	igmph->csum = csum_fold(skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+			      unsigned int ihl, unsigned int ipl)
+{
+	struct icmp6hdr *icmp6h;
+
+	icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
+	if (icmp6h == NULL)
+		return 0;
+
+	icmp6h->icmp6_cksum = 0;
+	skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
+	icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					      ipl - ihl, IPPROTO_ICMPV6,
+					      skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph,
+			     unsigned int ihl, unsigned int ipl)
+{
+	struct tcphdr *tcph;
+
+	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+	if (tcph == NULL)
+		return 0;
+
+	tcph->check = 0;
+	skb->csum = csum_partial(tcph, ipl - ihl, 0);
+	tcph->check = tcp_v4_check(ipl - ihl,
+				   iph->saddr, iph->daddr, skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int ihl, unsigned int ipl)
+{
+	struct tcphdr *tcph;
+
+	tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
+	if (tcph == NULL)
+		return 0;
+
+	tcph->check = 0;
+	skb->csum = csum_partial(tcph, ipl - ihl, 0);
+	tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+				      ipl - ihl, IPPROTO_TCP,
+				      skb->csum);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return 1;
+}
+
+static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph,
+			     unsigned int ihl, unsigned int ipl, int udplite)
+{
+	struct udphdr *udph;
+	u16 ul;
+
+	/*
+	 * Support both UDP and UDPLITE checksum algorithms, Don't use
+	 * udph->len to get the real length without any protocol check,
+	 * UDPLITE uses udph->len for another thing,
+	 * Use iph->tot_len, or just ipl.
+	 */
+
+	udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+	if (udph == NULL)
+		return 0;
+
+	ul = ntohs(udph->len);
+
+	if (udplite || udph->check) {
+
+		udph->check = 0;
+
+		if (udplite) {
+			if (ul == 0)
+				skb->csum = csum_partial(udph, ipl - ihl, 0);
+			else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+				skb->csum = csum_partial(udph, ul, 0);
+			else
+				goto ignore_obscure_skb;
+		} else {
+			if (ul != ipl - ihl)
+				goto ignore_obscure_skb;
+
+			skb->csum = csum_partial(udph, ul, 0);
+		}
+
+		udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+						ul, iph->protocol,
+						skb->csum);
+
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+	return 1;
+}
+
+static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int ihl, unsigned int ipl, int udplite)
+{
+	struct udphdr *udph;
+	u16 ul;
+
+	/*
+	 * Support both UDP and UDPLITE checksum algorithms, Don't use
+	 * udph->len to get the real length without any protocol check,
+	 * UDPLITE uses udph->len for another thing,
+	 * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
+	 */
+
+	udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
+	if (udph == NULL)
+		return 0;
+
+	ul = ntohs(udph->len);
+
+	udph->check = 0;
+
+	if (udplite) {
+		if (ul == 0)
+			skb->csum = csum_partial(udph, ipl - ihl, 0);
+
+		else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
+			skb->csum = csum_partial(udph, ul, 0);
+
+		else
+			goto ignore_obscure_skb;
+	} else {
+		if (ul != ipl - ihl)
+			goto ignore_obscure_skb;
+
+		skb->csum = csum_partial(udph, ul, 0);
+	}
+
+	udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
+				      udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
+				      skb->csum);
+
+	if (!udph->check)
+		udph->check = CSUM_MANGLED_0;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+ignore_obscure_skb:
+	return 1;
+}
+
+static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
+{
+	struct iphdr *iph;
+	int ntkoff;
+
+	ntkoff = skb_network_offset(skb);
+
+	if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
+		goto fail;
+
+	iph = ip_hdr(skb);
+
+	switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
+	case IPPROTO_ICMP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+			if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
+						ntohs(iph->tot_len)))
+				goto fail;
+		break;
+	case IPPROTO_IGMP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
+			if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
+						ntohs(iph->tot_len)))
+				goto fail;
+		break;
+	case IPPROTO_TCP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+			if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4,
+					       ntohs(iph->tot_len)))
+				goto fail;
+		break;
+	case IPPROTO_UDP:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+					       ntohs(iph->tot_len), 0))
+				goto fail;
+		break;
+	case IPPROTO_UDPLITE:
+		if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+			if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4,
+					       ntohs(iph->tot_len), 1))
+				goto fail;
+		break;
+	}
+
+	if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
+		if (skb_cloned(skb) &&
+		    !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
+		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+			goto fail;
+
+		ip_send_check(iph);
+	}
+
+	return 1;
+
+fail:
+	return 0;
+}
+
+static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
+				 unsigned int ixhl, unsigned int *pl)
+{
+	int off, len, optlen;
+	unsigned char *xh = (void *)ip6xh;
+
+	off = sizeof(*ip6xh);
+	len = ixhl - off;
+
+	while (len > 1) {
+		switch (xh[off]) {
+		case IPV6_TLV_PAD0:
+			optlen = 1;
+			break;
+		case IPV6_TLV_JUMBO:
+			optlen = xh[off + 1] + 2;
+			if (optlen != 6 || len < 6 || (off & 3) != 2)
+				/* wrong jumbo option length/alignment */
+				return 0;
+			*pl = ntohl(*(__be32 *)(xh + off + 2));
+			goto done;
+		default:
+			optlen = xh[off + 1] + 2;
+			if (optlen > len)
+				/* ignore obscure options */
+				goto done;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+
+done:
+	return 1;
+}
+
+static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
+{
+	struct ipv6hdr *ip6h;
+	struct ipv6_opt_hdr *ip6xh;
+	unsigned int hl, ixhl;
+	unsigned int pl;
+	int ntkoff;
+	u8 nexthdr;
+
+	ntkoff = skb_network_offset(skb);
+
+	hl = sizeof(*ip6h);
+
+	if (!pskb_may_pull(skb, hl + ntkoff))
+		goto fail;
+
+	ip6h = ipv6_hdr(skb);
+
+	pl = ntohs(ip6h->payload_len);
+	nexthdr = ip6h->nexthdr;
+
+	do {
+		switch (nexthdr) {
+		case NEXTHDR_FRAGMENT:
+			goto ignore_skb;
+		case NEXTHDR_ROUTING:
+		case NEXTHDR_HOP:
+		case NEXTHDR_DEST:
+			if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
+				goto fail;
+			ip6xh = (void *)(skb_network_header(skb) + hl);
+			ixhl = ipv6_optlen(ip6xh);
+			if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
+				goto fail;
+			if ((nexthdr == NEXTHDR_HOP) &&
+			    !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
+				goto fail;
+			nexthdr = ip6xh->nexthdr;
+			hl += ixhl;
+			break;
+		case IPPROTO_ICMPV6:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
+				if (!tcf_csum_ipv6_icmp(skb, ip6h,
+							hl, pl + sizeof(*ip6h)))
+					goto fail;
+			goto done;
+		case IPPROTO_TCP:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
+				if (!tcf_csum_ipv6_tcp(skb, ip6h,
+						       hl, pl + sizeof(*ip6h)))
+					goto fail;
+			goto done;
+		case IPPROTO_UDP:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
+				if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+						       pl + sizeof(*ip6h), 0))
+					goto fail;
+			goto done;
+		case IPPROTO_UDPLITE:
+			if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
+				if (!tcf_csum_ipv6_udp(skb, ip6h, hl,
+						       pl + sizeof(*ip6h), 1))
+					goto fail;
+			goto done;
+		default:
+			goto ignore_skb;
+		}
+	} while (pskb_may_pull(skb, hl + 1 + ntkoff));
+
+done:
+ignore_skb:
+	return 1;
+
+fail:
+	return 0;
+}
+
+static int tcf_csum(struct sk_buff *skb,
+		    struct tc_action *a, struct tcf_result *res)
+{
+	struct tcf_csum *p = a->priv;
+	int action;
+	u32 update_flags;
+
+	spin_lock(&p->tcf_lock);
+	p->tcf_tm.lastuse = jiffies;
+	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
+	p->tcf_bstats.packets++;
+	action = p->tcf_action;
+	update_flags = p->update_flags;
+	spin_unlock(&p->tcf_lock);
+
+	if (unlikely(action == TC_ACT_SHOT))
+		goto drop;
+
+	switch (skb->protocol) {
+	case cpu_to_be16(ETH_P_IP):
+		if (!tcf_csum_ipv4(skb, update_flags))
+			goto drop;
+		break;
+	case cpu_to_be16(ETH_P_IPV6):
+		if (!tcf_csum_ipv6(skb, update_flags))
+			goto drop;
+		break;
+	}
+
+	return action;
+
+drop:
+	spin_lock(&p->tcf_lock);
+	p->tcf_qstats.drops++;
+	spin_unlock(&p->tcf_lock);
+	return TC_ACT_SHOT;
+}
+
+static int tcf_csum_dump(struct sk_buff *skb,
+			 struct tc_action *a, int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_csum *p = a->priv;
+	struct tc_csum opt = {
+		.update_flags = p->update_flags,
+		.index   = p->tcf_index,
+		.action  = p->tcf_action,
+		.refcnt  = p->tcf_refcnt - ref,
+		.bindcnt = p->tcf_bindcnt - bind,
+	};
+	struct tcf_t t;
+
+	NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt);
+	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
+	NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t);
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tc_action_ops act_csum_ops = {
+	.kind		= "csum",
+	.hinfo		= &csum_hash_info,
+	.type		= TCA_ACT_CSUM,
+	.capab		= TCA_CAP_NONE,
+	.owner		= THIS_MODULE,
+	.act		= tcf_csum,
+	.dump		= tcf_csum_dump,
+	.cleanup	= tcf_csum_cleanup,
+	.lookup		= tcf_hash_search,
+	.init		= tcf_csum_init,
+	.walk		= tcf_generic_walker
+};
+
+MODULE_DESCRIPTION("Checksum updating actions");
+MODULE_LICENSE("GPL");
+
+static int __init csum_init_module(void)
+{
+	return tcf_register_action(&act_csum_ops);
+}
+
+static void __exit csum_cleanup_module(void)
+{
+	tcf_unregister_action(&act_csum_ops);
+}
+
+module_init(csum_init_module);
+module_exit(csum_cleanup_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 8406c665499..c2ed90a4c0b 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -152,21 +152,24 @@ static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result
 static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tc_gact opt;
 	struct tcf_gact *gact = a->priv;
+	struct tc_gact opt = {
+		.index   = gact->tcf_index,
+		.refcnt  = gact->tcf_refcnt - ref,
+		.bindcnt = gact->tcf_bindcnt - bind,
+		.action  = gact->tcf_action,
+	};
 	struct tcf_t t;
 
-	opt.index = gact->tcf_index;
-	opt.refcnt = gact->tcf_refcnt - ref;
-	opt.bindcnt = gact->tcf_bindcnt - bind;
-	opt.action = gact->tcf_action;
 	NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
 #ifdef CONFIG_GACT_PROB
 	if (gact->tcfg_ptype) {
-		struct tc_gact_p p_opt;
-		p_opt.paction = gact->tcfg_paction;
-		p_opt.pval = gact->tcfg_pval;
-		p_opt.ptype = gact->tcfg_ptype;
+		struct tc_gact_p p_opt = {
+			.paction = gact->tcfg_paction,
+			.pval    = gact->tcfg_pval,
+			.ptype   = gact->tcfg_ptype,
+		};
+
 		NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
 	}
 #endif
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c7e59e6ec34..8daef963225 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -39,7 +39,7 @@ static struct tcf_hashinfo ipt_hash_info = {
 	.lock	=	&ipt_lock,
 };
 
-static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
@@ -66,7 +66,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 	return 0;
 }
 
-static void ipt_destroy_target(struct ipt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t)
 {
 	struct xt_tgdtor_param par = {
 		.target   = t->u.kernel.target,
@@ -99,7 +99,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 	[TCA_IPT_TABLE]	= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_IPT_HOOK]	= { .type = NLA_U32 },
 	[TCA_IPT_INDEX]	= { .type = NLA_U32 },
-	[TCA_IPT_TARG]	= { .len = sizeof(struct ipt_entry_target) },
+	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },
 };
 
 static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
@@ -108,7 +108,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
 	struct tcf_common *pc;
-	struct ipt_entry_target *td, *t;
+	struct xt_entry_target *td, *t;
 	char *tname;
 	int ret = 0, err;
 	u32 hook = 0;
@@ -126,7 +126,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
 	if (tb[TCA_IPT_TARG] == NULL)
 		return -EINVAL;
 
-	td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+	td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
 	if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
 		return -EINVAL;
 
@@ -230,7 +230,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 		result = TC_ACT_SHOT;
 		ipt->tcf_qstats.drops++;
 		break;
-	case IPT_CONTINUE:
+	case XT_CONTINUE:
 		result = TC_ACT_PIPE;
 		break;
 	default:
@@ -249,7 +249,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_ipt *ipt = a->priv;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct tcf_t tm;
 	struct tc_cnt c;
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1980b71c283..0c311be9282 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -165,6 +165,8 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&m->tcf_lock);
 	m->tcf_tm.lastuse = jiffies;
+	m->tcf_bstats.bytes += qdisc_pkt_len(skb);
+	m->tcf_bstats.packets++;
 
 	dev = m->tcfm_dev;
 	if (!dev) {
@@ -179,13 +181,11 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 		goto out;
 	}
 
-	skb2 = skb_act_clone(skb, GFP_ATOMIC);
+	at = G_TC_AT(skb->tc_verd);
+	skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
 	if (skb2 == NULL)
 		goto out;
 
-	m->tcf_bstats.bytes += qdisc_pkt_len(skb2);
-	m->tcf_bstats.packets++;
-	at = G_TC_AT(skb->tc_verd);
 	if (!(at & AT_EGRESS)) {
 		if (m->tcfm_ok_push)
 			skb_push(skb2, skb2->dev->hard_header_len);
@@ -195,16 +195,14 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 	if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
-	skb2->dev = dev;
 	skb2->skb_iif = skb->dev->ifindex;
+	skb2->dev = dev;
 	dev_queue_xmit(skb2);
 	err = 0;
 
 out:
 	if (err) {
 		m->tcf_qstats.overlimits++;
-		m->tcf_bstats.bytes += qdisc_pkt_len(skb);
-		m->tcf_bstats.packets++;
 		/* should we be asking for packet to be dropped?
 		 * may make sense for redirect case only
 		 */
@@ -221,15 +219,16 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_mirred *m = a->priv;
-	struct tc_mirred opt;
+	struct tc_mirred opt = {
+		.index   = m->tcf_index,
+		.action  = m->tcf_action,
+		.refcnt  = m->tcf_refcnt - ref,
+		.bindcnt = m->tcf_bindcnt - bind,
+		.eaction = m->tcfm_eaction,
+		.ifindex = m->tcfm_ifindex,
+	};
 	struct tcf_t t;
 
-	opt.index = m->tcf_index;
-	opt.action = m->tcf_action;
-	opt.refcnt = m->tcf_refcnt - ref;
-	opt.bindcnt = m->tcf_bindcnt - bind;
-	opt.eaction = m->tcfm_eaction;
-	opt.ifindex = m->tcfm_ifindex;
 	NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
 	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 724553e8ed7..186eb837e60 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -114,6 +114,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	int egress;
 	int action;
 	int ihl;
+	int noff;
 
 	spin_lock(&p->tcf_lock);
 
@@ -132,7 +133,8 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	if (unlikely(action == TC_ACT_SHOT))
 		goto drop;
 
-	if (!pskb_may_pull(skb, sizeof(*iph)))
+	noff = skb_network_offset(skb);
+	if (!pskb_may_pull(skb, sizeof(*iph) + noff))
 		goto drop;
 
 	iph = ip_hdr(skb);
@@ -144,7 +146,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 
 	if (!((old_addr ^ addr) & mask)) {
 		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb, sizeof(*iph)) &&
+		    !skb_clone_writable(skb, sizeof(*iph) + noff) &&
 		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 			goto drop;
 
@@ -172,9 +174,9 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	{
 		struct tcphdr *tcph;
 
-		if (!pskb_may_pull(skb, ihl + sizeof(*tcph)) ||
+		if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
 		    (skb_cloned(skb) &&
-		     !skb_clone_writable(skb, ihl + sizeof(*tcph)) &&
+		     !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
 		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
 			goto drop;
 
@@ -186,9 +188,9 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	{
 		struct udphdr *udph;
 
-		if (!pskb_may_pull(skb, ihl + sizeof(*udph)) ||
+		if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
 		    (skb_cloned(skb) &&
-		     !skb_clone_writable(skb, ihl + sizeof(*udph)) &&
+		     !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
 		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
 			goto drop;
 
@@ -205,7 +207,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	{
 		struct icmphdr *icmph;
 
-		if (!pskb_may_pull(skb, ihl + sizeof(*icmph)))
+		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
 			goto drop;
 
 		icmph = (void *)(skb_network_header(skb) + ihl);
@@ -215,9 +217,11 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 		    (icmph->type != ICMP_PARAMETERPROB))
 			break;
 
-		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph)))
+		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
+					noff))
 			goto drop;
 
+		icmph = (void *)(skb_network_header(skb) + ihl);
 		iph = (void *)(icmph + 1);
 		if (egress)
 			addr = iph->daddr;
@@ -228,8 +232,8 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 			break;
 
 		if (skb_cloned(skb) &&
-		    !skb_clone_writable(skb,
-					ihl + sizeof(*icmph) + sizeof(*iph)) &&
+		    !skb_clone_writable(skb, ihl + sizeof(*icmph) +
+					     sizeof(*iph) + noff) &&
 		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 			goto drop;
 
@@ -246,7 +250,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 			iph->saddr = new_addr;
 
 		inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
-					 1);
+					 0);
 		break;
 	}
 	default:
@@ -268,40 +272,29 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_nat *p = a->priv;
-	struct tc_nat *opt;
+	struct tc_nat opt = {
+		.old_addr = p->old_addr,
+		.new_addr = p->new_addr,
+		.mask     = p->mask,
+		.flags    = p->flags,
+
+		.index    = p->tcf_index,
+		.action   = p->tcf_action,
+		.refcnt   = p->tcf_refcnt - ref,
+		.bindcnt  = p->tcf_bindcnt - bind,
+	};
 	struct tcf_t t;
-	int s;
-
-	s = sizeof(*opt);
-
-	/* netlink spinlocks held above us - must use ATOMIC */
-	opt = kzalloc(s, GFP_ATOMIC);
-	if (unlikely(!opt))
-		return -ENOBUFS;
 
-	opt->old_addr = p->old_addr;
-	opt->new_addr = p->new_addr;
-	opt->mask = p->mask;
-	opt->flags = p->flags;
-
-	opt->index = p->tcf_index;
-	opt->action = p->tcf_action;
-	opt->refcnt = p->tcf_refcnt - ref;
-	opt->bindcnt = p->tcf_bindcnt - bind;
-
-	NLA_PUT(skb, TCA_NAT_PARMS, s, opt);
+	NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt);
 	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
 	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
 	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
 	NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t);
 
-	kfree(opt);
-
 	return skb->len;
 
 nla_put_failure:
 	nlmsg_trim(skb, b);
-	kfree(opt);
 	return -1;
 }
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 50e3d945e1f..a0593c9640d 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -127,8 +127,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 	int i, munged = 0;
 	unsigned int off;
 
-	if (!(skb->tc_verd & TC_OK2MUNGE)) {
-		/* should we set skb->cloned? */
+	if (skb_cloned(skb)) {
 		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
 			return p->tcf_action;
 		}
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 654f73dff7c..7ebf7439b47 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -97,6 +97,11 @@ nla_put_failure:
 	goto done;
 }
 
+static void tcf_police_free_rcu(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tcf_police, tcf_rcu));
+}
+
 static void tcf_police_destroy(struct tcf_police *p)
 {
 	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
@@ -113,7 +118,11 @@ static void tcf_police_destroy(struct tcf_police *p)
 				qdisc_put_rtab(p->tcfp_R_tab);
 			if (p->tcfp_P_tab)
 				qdisc_put_rtab(p->tcfp_P_tab);
-			kfree(p);
+			/*
+			 * gen_estimator est_timer() might access p->tcf_lock
+			 * or bstats, wait a RCU grace period before freeing p
+			 */
+			call_rcu(&p->tcf_rcu, tcf_police_free_rcu);
 			return;
 		}
 	}
@@ -341,22 +350,19 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_police *police = a->priv;
-	struct tc_police opt;
-
-	opt.index = police->tcf_index;
-	opt.action = police->tcf_action;
-	opt.mtu = police->tcfp_mtu;
-	opt.burst = police->tcfp_burst;
-	opt.refcnt = police->tcf_refcnt - ref;
-	opt.bindcnt = police->tcf_bindcnt - bind;
+	struct tc_police opt = {
+		.index = police->tcf_index,
+		.action = police->tcf_action,
+		.mtu = police->tcfp_mtu,
+		.burst = police->tcfp_burst,
+		.refcnt = police->tcf_refcnt - ref,
+		.bindcnt = police->tcf_bindcnt - bind,
+	};
+
 	if (police->tcfp_R_tab)
 		opt.rate = police->tcfp_R_tab->rate;
-	else
-		memset(&opt.rate, 0, sizeof(opt.rate));
 	if (police->tcfp_P_tab)
 		opt.peakrate = police->tcfp_P_tab->rate;
-	else
-		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 	NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
 	if (police->tcfp_result)
 		NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
@@ -397,6 +403,7 @@ static void __exit
 police_cleanup_module(void)
 {
 	tcf_unregister_action(&act_police_ops);
+	rcu_barrier(); /* Wait for completion of call_rcu()'s (tcf_police_free_rcu) */
 }
 
 module_init(police_init_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 1b4bc691d7d..97e84f3ee77 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -73,10 +73,10 @@ static int tcf_simp_release(struct tcf_defact *d, int bind)
 
 static int alloc_defdata(struct tcf_defact *d, char *defdata)
 {
-	d->tcfd_defdata = kstrndup(defdata, SIMP_MAX_DATA, GFP_KERNEL);
+	d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
 	if (unlikely(!d->tcfd_defdata))
 		return -ENOMEM;
-
+	strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
 	return 0;
 }
 
@@ -164,13 +164,14 @@ static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_defact *d = a->priv;
-	struct tc_defact opt;
+	struct tc_defact opt = {
+		.index   = d->tcf_index,
+		.refcnt  = d->tcf_refcnt - ref,
+		.bindcnt = d->tcf_bindcnt - bind,
+		.action  = d->tcf_action,
+	};
 	struct tcf_t t;
 
-	opt.index = d->tcf_index;
-	opt.refcnt = d->tcf_refcnt - ref;
-	opt.bindcnt = d->tcf_bindcnt - bind;
-	opt.action = d->tcf_action;
 	NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
 	NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
 	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e9607fe55b5..66cbf4eb885 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -159,13 +159,14 @@ static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_skbedit *d = a->priv;
-	struct tc_skbedit opt;
+	struct tc_skbedit opt = {
+		.index   = d->tcf_index,
+		.refcnt  = d->tcf_refcnt - ref,
+		.bindcnt = d->tcf_bindcnt - bind,
+		.action  = d->tcf_action,
+	};
 	struct tcf_t t;
 
-	opt.index = d->tcf_index;
-	opt.refcnt = d->tcf_refcnt - ref;
-	opt.bindcnt = d->tcf_bindcnt - bind;
-	opt.action = d->tcf_action;
 	NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130..37dff78e9cb 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	 * calls by looking at the number of nested bh disable calls because
 	 * softirqs always disables bh.
 	 */
-	if (softirq_count() != SOFTIRQ_OFFSET) {
+	if (in_serving_softirq()) {
 		/* If there is an sk_classid we'll use that. */
 		if (!skb->sk)
 			return -1;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index f73542d2cdd..5b271a18bc3 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -65,109 +65,134 @@ static inline u32 addr_fold(void *addr)
 	return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
 }
 
-static u32 flow_get_src(const struct sk_buff *skb)
+static u32 flow_get_src(struct sk_buff *skb)
 {
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-		return ntohl(ip_hdr(skb)->saddr);
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
+			return ntohl(ip_hdr(skb)->saddr);
+		break;
 	case htons(ETH_P_IPV6):
-		return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
-	default:
-		return addr_fold(skb->sk);
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+			return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);
+		break;
 	}
+
+	return addr_fold(skb->sk);
 }
 
-static u32 flow_get_dst(const struct sk_buff *skb)
+static u32 flow_get_dst(struct sk_buff *skb)
 {
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-		return ntohl(ip_hdr(skb)->daddr);
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr)))
+			return ntohl(ip_hdr(skb)->daddr);
+		break;
 	case htons(ETH_P_IPV6):
-		return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
-	default:
-		return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
+			return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);
+		break;
 	}
+
+	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
 }
 
-static u32 flow_get_proto(const struct sk_buff *skb)
+static u32 flow_get_proto(struct sk_buff *skb)
 {
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
-		return ip_hdr(skb)->protocol;
+		return pskb_network_may_pull(skb, sizeof(struct iphdr)) ?
+		       ip_hdr(skb)->protocol : 0;
 	case htons(ETH_P_IPV6):
-		return ipv6_hdr(skb)->nexthdr;
+		return pskb_network_may_pull(skb, sizeof(struct ipv6hdr)) ?
+		       ipv6_hdr(skb)->nexthdr : 0;
 	default:
 		return 0;
 	}
 }
 
-static int has_ports(u8 protocol)
+static u32 flow_get_proto_src(struct sk_buff *skb)
 {
-	switch (protocol) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-	case IPPROTO_SCTP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ESP:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
-static u32 flow_get_proto_src(const struct sk_buff *skb)
-{
-	u32 res = 0;
-
 	switch (skb->protocol) {
 	case htons(ETH_P_IP): {
-		struct iphdr *iph = ip_hdr(skb);
+		struct iphdr *iph;
+		int poff;
 
-		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
-		    has_ports(iph->protocol))
-			res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4));
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ip_hdr(skb);
+		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) {
+			iph = ip_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
+						 poff));
+		}
 		break;
 	}
 	case htons(ETH_P_IPV6): {
-		struct ipv6hdr *iph = ipv6_hdr(skb);
+		struct ipv6hdr *iph;
+		int poff;
 
-		if (has_ports(iph->nexthdr))
-			res = ntohs(*(__be16 *)&iph[1]);
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ipv6_hdr(skb);
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) {
+			iph = ipv6_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
+						 poff));
+		}
 		break;
 	}
-	default:
-		res = addr_fold(skb->sk);
 	}
 
-	return res;
+	return addr_fold(skb->sk);
 }
 
-static u32 flow_get_proto_dst(const struct sk_buff *skb)
+static u32 flow_get_proto_dst(struct sk_buff *skb)
 {
-	u32 res = 0;
-
 	switch (skb->protocol) {
 	case htons(ETH_P_IP): {
-		struct iphdr *iph = ip_hdr(skb);
+		struct iphdr *iph;
+		int poff;
 
-		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
-		    has_ports(iph->protocol))
-			res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2));
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ip_hdr(skb);
+		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+			iph = ip_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 +
+						 2 + poff));
+		}
 		break;
 	}
 	case htons(ETH_P_IPV6): {
-		struct ipv6hdr *iph = ipv6_hdr(skb);
+		struct ipv6hdr *iph;
+		int poff;
 
-		if (has_ports(iph->nexthdr))
-			res = ntohs(*(__be16 *)((void *)&iph[1] + 2));
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			break;
+		iph = ipv6_hdr(skb);
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) {
+			iph = ipv6_hdr(skb);
+			return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) +
+						 poff + 2));
+		}
 		break;
 	}
-	default:
-		res = addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
 	}
 
-	return res;
+	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
 }
 
 static u32 flow_get_iif(const struct sk_buff *skb)
@@ -211,7 +236,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb)
 })
 #endif
 
-static u32 flow_get_nfct_src(const struct sk_buff *skb)
+static u32 flow_get_nfct_src(struct sk_buff *skb)
 {
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
@@ -223,7 +248,7 @@ fallback:
 	return flow_get_src(skb);
 }
 
-static u32 flow_get_nfct_dst(const struct sk_buff *skb)
+static u32 flow_get_nfct_dst(struct sk_buff *skb)
 {
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
@@ -235,14 +260,14 @@ fallback:
 	return flow_get_dst(skb);
 }
 
-static u32 flow_get_nfct_proto_src(const struct sk_buff *skb)
+static u32 flow_get_nfct_proto_src(struct sk_buff *skb)
 {
 	return ntohs(CTTUPLE(skb, src.u.all));
 fallback:
 	return flow_get_proto_src(skb);
 }
 
-static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb)
+static u32 flow_get_nfct_proto_dst(struct sk_buff *skb)
 {
 	return ntohs(CTTUPLE(skb, dst.u.all));
 fallback:
@@ -281,7 +306,12 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb)
 	return tag & VLAN_VID_MASK;
 }
 
-static u32 flow_key_get(const struct sk_buff *skb, int key)
+static u32 flow_get_rxhash(struct sk_buff *skb)
+{
+	return skb_get_rxhash(skb);
+}
+
+static u32 flow_key_get(struct sk_buff *skb, int key)
 {
 	switch (key) {
 	case FLOW_KEY_SRC:
@@ -318,6 +348,8 @@ static u32 flow_key_get(const struct sk_buff *skb, int key)
 		return flow_get_skgid(skb);
 	case FLOW_KEY_VLAN_TAG:
 		return flow_get_vlan_tag(skb);
+	case FLOW_KEY_RXHASH:
+		return flow_get_rxhash(skb);
 	default:
 		WARN_ON(1);
 		return 0;
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index dd9414e4420..425a1790b04 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -143,9 +143,17 @@ static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	u8 tunnelid = 0;
 	u8 *xprt;
 #if RSVP_DST_LEN == 4
-	struct ipv6hdr *nhptr = ipv6_hdr(skb);
+	struct ipv6hdr *nhptr;
+
+	if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+		return -1;
+	nhptr = ipv6_hdr(skb);
 #else
-	struct iphdr *nhptr = ip_hdr(skb);
+	struct iphdr *nhptr;
+
+	if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
+		return -1;
+	nhptr = ip_hdr(skb);
 #endif
 
 restart:
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4f522143811..b0c2a82178a 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -134,10 +134,12 @@ next_knode:
 #endif
 
 		for (i = n->sel.nkeys; i>0; i--, key++) {
-			unsigned int toff;
+			int toff = off + key->off + (off2 & key->offmask);
 			__be32 *data, _data;
 
-			toff = off + key->off + (off2 & key->offmask);
+			if (skb_headroom(skb) + toff > INT_MAX)
+				goto out;
+
 			data = skb_header_pointer(skb, toff, 4, &_data);
 			if (!data)
 				goto out;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 3bcac8aa333..34da5e29ea1 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -223,6 +223,11 @@ META_COLLECTOR(int_maclen)
 	dst->value = skb->mac_len;
 }
 
+META_COLLECTOR(int_rxhash)
+{
+	dst->value = skb_get_rxhash(skb);
+}
+
 /**************************************************************************
  * Netfilter
  **************************************************************************/
@@ -541,6 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 		[META_ID(SK_SENDMSG_OFF)]	= META_FUNC(int_sk_sendmsg_off),
 		[META_ID(SK_WRITE_PENDING)]	= META_FUNC(int_sk_write_pend),
 		[META_ID(VLAN_TAG)]		= META_FUNC(int_vlan_tag),
+		[META_ID(RXHASH)]		= META_FUNC(int_rxhash),
 	}
 };
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b9e8c3b7d40..b22ca2d1ceb 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -150,22 +150,34 @@ int register_qdisc(struct Qdisc_ops *qops)
 	if (qops->enqueue == NULL)
 		qops->enqueue = noop_qdisc_ops.enqueue;
 	if (qops->peek == NULL) {
-		if (qops->dequeue == NULL) {
+		if (qops->dequeue == NULL)
 			qops->peek = noop_qdisc_ops.peek;
-		} else {
-			rc = -EINVAL;
-			goto out;
-		}
+		else
+			goto out_einval;
 	}
 	if (qops->dequeue == NULL)
 		qops->dequeue = noop_qdisc_ops.dequeue;
 
+	if (qops->cl_ops) {
+		const struct Qdisc_class_ops *cops = qops->cl_ops;
+
+		if (!(cops->get && cops->put && cops->walk && cops->leaf))
+			goto out_einval;
+
+		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
+			goto out_einval;
+	}
+
 	qops->next = NULL;
 	*qp = qops;
 	rc = 0;
 out:
 	write_unlock(&qdisc_mod_lock);
 	return rc;
+
+out_einval:
+	rc = -EINVAL;
+	goto out;
 }
 EXPORT_SYMBOL(register_qdisc);
 
@@ -228,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 	if (q)
 		goto out;
 
-	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
+	if (dev_ingress_queue(dev))
+		q = qdisc_match_from_root(
+			dev_ingress_queue(dev)->qdisc_sleeping,
+			handle);
 out:
 	return q;
 }
@@ -348,7 +363,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
 	}
 
-	if (!s || tsize != s->tsize || (!tab && tsize > 0))
+	if (tsize != s->tsize || (!tab && tsize > 0))
 		return ERR_PTR(-EINVAL);
 
 	spin_lock(&qdisc_stab_lock);
@@ -678,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		    (new && new->flags & TCQ_F_INGRESS)) {
 			num_q = 1;
 			ingress = 1;
+			if (!dev_ingress_queue(dev))
+				return -ENOENT;
 		}
 
 		if (dev->flags & IFF_UP)
@@ -689,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		for (i = 0; i < num_q; i++) {
-			struct netdev_queue *dev_queue = &dev->rx_queue;
+			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
 
 			if (!ingress)
 				dev_queue = netdev_get_tx_queue(dev, i);
@@ -967,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
 			} else { /* ingress */
-				q = dev->rx_queue.qdisc_sleeping;
+				if (dev_ingress_queue(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1031,8 +1049,9 @@ replay:
 				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
 					return -ENOENT;
 				q = qdisc_leaf(p, clid);
-			} else { /*ingress */
-				q = dev->rx_queue.qdisc_sleeping;
+			} else { /* ingress */
+				if (dev_ingress_queue_create(dev))
+					q = dev_ingress_queue(dev)->qdisc_sleeping;
 			}
 		} else {
 			q = dev->qdisc;
@@ -1111,11 +1130,14 @@ replay:
 create_n_graft:
 	if (!(n->nlmsg_flags&NLM_F_CREATE))
 		return -ENOENT;
-	if (clid == TC_H_INGRESS)
-		q = qdisc_create(dev, &dev->rx_queue, p,
-				 tcm->tcm_parent, tcm->tcm_parent,
-				 tca, &err);
-	else {
+	if (clid == TC_H_INGRESS) {
+		if (dev_ingress_queue(dev))
+			q = qdisc_create(dev, dev_ingress_queue(dev), p,
+					 tcm->tcm_parent, tcm->tcm_parent,
+					 tca, &err);
+		else
+			err = -ENOENT;
+	} else {
 		struct netdev_queue *dev_queue;
 
 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
@@ -1292,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
-		dev_queue = &dev->rx_queue;
-		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+		dev_queue = dev_ingress_queue(dev);
+		if (dev_queue &&
+		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
+				       &q_idx, s_q_idx) < 0)
 			goto done;
 
 cont:
@@ -1583,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
-	dev_queue = &dev->rx_queue;
-	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+	dev_queue = dev_ingress_queue(dev);
+	if (dev_queue &&
+	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
+				&t, s_t) < 0)
 		goto done;
 
 done:
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index fcbb86a486a..282540778aa 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -52,7 +52,7 @@ struct atm_flow_data {
 	int			ref;		/* reference count */
 	struct gnet_stats_basic_packed	bstats;
 	struct gnet_stats_queue	qstats;
-	struct atm_flow_data	*next;
+	struct list_head	list;
 	struct atm_flow_data	*excess;	/* flow for excess traffic;
 						   NULL to set CLP instead */
 	int			hdr_len;
@@ -61,34 +61,23 @@ struct atm_flow_data {
 
 struct atm_qdisc_data {
 	struct atm_flow_data	link;		/* unclassified skbs go here */
-	struct atm_flow_data	*flows;		/* NB: "link" is also on this
+	struct list_head	flows;		/* NB: "link" is also on this
 						   list */
 	struct tasklet_struct	task;		/* dequeue tasklet */
 };
 
 /* ------------------------- Class/flow operations ------------------------- */
 
-static int find_flow(struct atm_qdisc_data *qdisc, struct atm_flow_data *flow)
-{
-	struct atm_flow_data *walk;
-
-	pr_debug("find_flow(qdisc %p,flow %p)\n", qdisc, flow);
-	for (walk = qdisc->flows; walk; walk = walk->next)
-		if (walk == flow)
-			return 1;
-	pr_debug("find_flow: not found\n");
-	return 0;
-}
-
 static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow;
 
-	for (flow = p->flows; flow; flow = flow->next)
+	list_for_each_entry(flow, &p->flows, list) {
 		if (flow->classid == classid)
-			break;
-	return flow;
+			return flow;
+	}
+	return NULL;
 }
 
 static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
@@ -99,7 +88,7 @@ static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
 
 	pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
 		sch, p, flow, new, old);
-	if (!find_flow(p, flow))
+	if (list_empty(&flow->list))
 		return -EINVAL;
 	if (!new)
 		new = &noop_qdisc;
@@ -146,20 +135,12 @@ static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
-	struct atm_flow_data **prev;
 
 	pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
 	if (--flow->ref)
 		return;
 	pr_debug("atm_tc_put: destroying\n");
-	for (prev = &p->flows; *prev; prev = &(*prev)->next)
-		if (*prev == flow)
-			break;
-	if (!*prev) {
-		printk(KERN_CRIT "atm_tc_put: class %p not found\n", flow);
-		return;
-	}
-	*prev = flow->next;
+	list_del_init(&flow->list);
 	pr_debug("atm_tc_put: qdisc %p\n", flow->q);
 	qdisc_destroy(flow->q);
 	tcf_destroy_chain(&flow->filter_list);
@@ -274,10 +255,6 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 			error = -EINVAL;
 			goto err_out;
 		}
-		if (find_flow(p, flow)) {
-			error = -EEXIST;
-			goto err_out;
-		}
 	} else {
 		int i;
 		unsigned long cl;
@@ -298,8 +275,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 		goto err_out;
 	}
 	flow->filter_list = NULL;
-	flow->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-				    &pfifo_qdisc_ops, classid);
+	flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
 	if (!flow->q)
 		flow->q = &noop_qdisc;
 	pr_debug("atm_tc_change: qdisc %p\n", flow->q);
@@ -313,8 +289,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
 	flow->classid = classid;
 	flow->ref = 1;
 	flow->excess = excess;
-	flow->next = p->link.next;
-	p->link.next = flow;
+	list_add(&flow->list, &p->link.list);
 	flow->hdr_len = hdr_len;
 	if (hdr)
 		memcpy(flow->hdr, hdr, hdr_len);
@@ -335,7 +310,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
 	struct atm_flow_data *flow = (struct atm_flow_data *)arg;
 
 	pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
-	if (!find_flow(qdisc_priv(sch), flow))
+	if (list_empty(&flow->list))
 		return -EINVAL;
 	if (flow->filter_list || flow == &p->link)
 		return -EBUSY;
@@ -361,12 +336,12 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
 	if (walker->stop)
 		return;
-	for (flow = p->flows; flow; flow = flow->next) {
-		if (walker->count >= walker->skip)
-			if (walker->fn(sch, (unsigned long)flow, walker) < 0) {
-				walker->stop = 1;
-				break;
-			}
+	list_for_each_entry(flow, &p->flows, list) {
+		if (walker->count >= walker->skip &&
+		    walker->fn(sch, (unsigned long)flow, walker) < 0) {
+			walker->stop = 1;
+			break;
+		}
 		walker->count++;
 	}
 }
@@ -385,16 +360,17 @@ static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
 static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow = NULL;	/* @@@ */
+	struct atm_flow_data *flow;
 	struct tcf_result res;
 	int result;
 	int ret = NET_XMIT_POLICED;
 
 	pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
 	result = TC_POLICE_OK;	/* be nice to gcc */
+	flow = NULL;
 	if (TC_H_MAJ(skb->priority) != sch->handle ||
-	    !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority)))
-		for (flow = p->flows; flow; flow = flow->next)
+	    !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
+		list_for_each_entry(flow, &p->flows, list) {
 			if (flow->filter_list) {
 				result = tc_classify_compat(skb,
 							    flow->filter_list,
@@ -404,8 +380,13 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 				flow = (struct atm_flow_data *)res.class;
 				if (!flow)
 					flow = lookup_flow(sch, res.classid);
-				break;
+				goto done;
 			}
+		}
+		flow = NULL;
+	done:
+		;		
+	}
 	if (!flow)
 		flow = &p->link;
 	else {
@@ -432,7 +413,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	}
 
 	ret = qdisc_enqueue(skb, flow->q);
-	if (ret != 0) {
+	if (ret != NET_XMIT_SUCCESS) {
 drop: __maybe_unused
 		if (net_xmit_drop_count(ret)) {
 			sch->qstats.drops++;
@@ -456,7 +437,7 @@ drop: __maybe_unused
 	 */
 	if (flow == &p->link) {
 		sch->q.qlen++;
-		return 0;
+		return NET_XMIT_SUCCESS;
 	}
 	tasklet_schedule(&p->task);
 	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
@@ -477,7 +458,9 @@ static void sch_atm_dequeue(unsigned long data)
 	struct sk_buff *skb;
 
 	pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
-	for (flow = p->link.next; flow; flow = flow->next)
+	list_for_each_entry(flow, &p->flows, list) {
+		if (flow == &p->link)
+			continue;
 		/*
 		 * If traffic is properly shaped, this won't generate nasty
 		 * little bursts. Otherwise, it may ... (but that's okay)
@@ -512,6 +495,7 @@ static void sch_atm_dequeue(unsigned long data)
 			/* atm.atm_options are already set by atm_tc_enqueue */
 			flow->vcc->send(flow->vcc, skb);
 		}
+	}
 }
 
 static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
@@ -543,9 +527,10 @@ static unsigned int atm_tc_drop(struct Qdisc *sch)
 	unsigned int len;
 
 	pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
-	for (flow = p->flows; flow; flow = flow->next)
+	list_for_each_entry(flow, &p->flows, list) {
 		if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
 			return len;
+	}
 	return 0;
 }
 
@@ -554,8 +539,10 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
 	struct atm_qdisc_data *p = qdisc_priv(sch);
 
 	pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
-	p->flows = &p->link;
-	p->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+	INIT_LIST_HEAD(&p->flows);
+	INIT_LIST_HEAD(&p->link.list);
+	list_add(&p->link.list, &p->flows);
+	p->link.q = qdisc_create_dflt(sch->dev_queue,
 				      &pfifo_qdisc_ops, sch->handle);
 	if (!p->link.q)
 		p->link.q = &noop_qdisc;
@@ -565,7 +552,6 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
 	p->link.sock = NULL;
 	p->link.classid = sch->handle;
 	p->link.ref = 1;
-	p->link.next = NULL;
 	tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
 	return 0;
 }
@@ -576,7 +562,7 @@ static void atm_tc_reset(struct Qdisc *sch)
 	struct atm_flow_data *flow;
 
 	pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
-	for (flow = p->flows; flow; flow = flow->next)
+	list_for_each_entry(flow, &p->flows, list)
 		qdisc_reset(flow->q);
 	sch->q.qlen = 0;
 }
@@ -584,24 +570,17 @@ static void atm_tc_reset(struct Qdisc *sch)
 static void atm_tc_destroy(struct Qdisc *sch)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
-	struct atm_flow_data *flow;
+	struct atm_flow_data *flow, *tmp;
 
 	pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
-	for (flow = p->flows; flow; flow = flow->next)
+	list_for_each_entry(flow, &p->flows, list)
 		tcf_destroy_chain(&flow->filter_list);
 
-	/* races ? */
-	while ((flow = p->flows)) {
+	list_for_each_entry_safe(flow, tmp, &p->flows, list) {
 		if (flow->ref > 1)
 			printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow,
 			       flow->ref);
 		atm_tc_put(sch, (unsigned long)flow);
-		if (p->flows == flow) {
-			printk(KERN_ERR "atm_destroy: putting flow %p didn't "
-			       "kill it\n", flow);
-			p->flows = flow->next;	/* brute force */
-			break;
-		}
 	}
 	tasklet_kill(&p->task);
 }
@@ -615,7 +594,7 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
 
 	pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
 		sch, p, flow, skb, tcm);
-	if (!find_flow(p, flow))
+	if (list_empty(&flow->list))
 		return -EINVAL;
 	tcm->tcm_handle = flow->classid;
 	tcm->tcm_info = flow->q->handle;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 28c01ef5abc..eb763159086 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1379,9 +1379,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->link.sibling = &q->link;
 	q->link.common.classid = sch->handle;
 	q->link.qdisc = sch;
-	if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					    &pfifo_qdisc_ops,
-					    sch->handle)))
+	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+				      sch->handle);
+	if (!q->link.q)
 		q->link.q = &noop_qdisc;
 
 	q->link.priority = TC_CBQ_MAXPRIO-1;
@@ -1623,7 +1623,7 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	struct cbq_class *cl = (struct cbq_class*)arg;
 
 	if (new == NULL) {
-		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+		new = qdisc_create_dflt(sch->dev_queue,
 					&pfifo_qdisc_ops, cl->common.classid);
 		if (new == NULL)
 			return -ENOBUFS;
@@ -1874,8 +1874,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
 	cl->R_tab = rtab;
 	rtab = NULL;
 	cl->refcnt = 1;
-	if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					&pfifo_qdisc_ops, classid)))
+	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
+	if (!cl->q)
 		cl->q = &noop_qdisc;
 	cl->common.classid = classid;
 	cl->tparent = parent;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index b74046a9539..aa8b5313f8c 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -110,7 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->refcnt	   = 1;
 	cl->common.classid = classid;
 	cl->quantum	   = quantum;
-	cl->qdisc	   = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+	cl->qdisc	   = qdisc_create_dflt(sch->dev_queue,
 					       &pfifo_qdisc_ops, classid);
 	if (cl->qdisc == NULL)
 		cl->qdisc = &noop_qdisc;
@@ -218,7 +218,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
 	struct drr_class *cl = (struct drr_class *)arg;
 
 	if (new == NULL) {
-		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+		new = qdisc_create_dflt(sch->dev_queue,
 					&pfifo_qdisc_ops, cl->common.classid);
 		if (new == NULL)
 			new = &noop_qdisc;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 63d41f86679..1d295d62bb5 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -61,8 +61,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
 		sch, p, new, old);
 
 	if (new == NULL) {
-		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					&pfifo_qdisc_ops,
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 					sch->handle);
 		if (new == NULL)
 			new = &noop_qdisc;
@@ -384,8 +383,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
 	p->default_index = default_index;
 	p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
 
-	p->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-				 &pfifo_qdisc_ops, sch->handle);
+	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
 	if (p->q == NULL)
 		p->q = &noop_qdisc;
 
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 5948bafa8ce..4dfecb0cba3 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -172,8 +172,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
 	struct Qdisc *q;
 	int err = -ENOMEM;
 
-	q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-			      ops, TC_H_MAKE(sch->handle, 1));
+	q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
 	if (q) {
 		err = fifo_set_limit(q, limit);
 		if (err < 0) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a63029ef3ed..5dbb3cd96e5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -96,7 +96,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
 		 * Another cpu is holding lock, requeue & delay xmits for
 		 * some time.
 		 */
-		__get_cpu_var(softnet_data).cpu_collision++;
+		__this_cpu_inc(softnet_data.cpu_collision);
 		ret = dev_requeue_skb(skb, q);
 	}
 
@@ -205,7 +205,7 @@ void __qdisc_run(struct Qdisc *q)
 		}
 	}
 
-	clear_bit(__QDISC_STATE_RUNNING, &q->state);
+	qdisc_run_end(q);
 }
 
 unsigned long dev_trans_start(struct net_device *dev)
@@ -327,6 +327,24 @@ void netif_carrier_off(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_carrier_off);
 
+/**
+ * 	netif_notify_peers - notify network peers about existence of @dev
+ * 	@dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netif_notify_peers(struct net_device *dev)
+{
+	rtnl_lock();
+	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(netif_notify_peers);
+
 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
    under all circumstances. It is difficult to invent anything faster or
    cheaper.
@@ -365,6 +383,7 @@ struct Qdisc noop_qdisc = {
 	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
 	.dev_queue	=	&noop_netdev_queue,
+	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
 };
 EXPORT_SYMBOL(noop_qdisc);
 
@@ -391,6 +410,7 @@ static struct Qdisc noqueue_qdisc = {
 	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
 	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
 	.dev_queue	=	&noqueue_netdev_queue,
+	.busylock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
 };
 
 
@@ -543,6 +563,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 
 	INIT_LIST_HEAD(&sch->list);
 	skb_queue_head_init(&sch->q);
+	spin_lock_init(&sch->busylock);
 	sch->ops = ops;
 	sch->enqueue = ops->enqueue;
 	sch->dequeue = ops->dequeue;
@@ -555,10 +576,8 @@ errout:
 	return ERR_PTR(err);
 }
 
-struct Qdisc * qdisc_create_dflt(struct net_device *dev,
-				 struct netdev_queue *dev_queue,
-				 struct Qdisc_ops *ops,
-				 unsigned int parentid)
+struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
+				struct Qdisc_ops *ops, unsigned int parentid)
 {
 	struct Qdisc *sch;
 
@@ -663,7 +682,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
 	struct Qdisc *qdisc;
 
 	if (dev->tx_queue_len) {
-		qdisc = qdisc_create_dflt(dev, dev_queue,
+		qdisc = qdisc_create_dflt(dev_queue,
 					  &pfifo_fast_ops, TC_H_ROOT);
 		if (!qdisc) {
 			printk(KERN_INFO "%s: activation failed\n", dev->name);
@@ -690,7 +709,7 @@ static void attach_default_qdiscs(struct net_device *dev)
 		dev->qdisc = txq->qdisc_sleeping;
 		atomic_inc(&dev->qdisc->refcnt);
 	} else {
-		qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
+		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
 		if (qdisc) {
 			qdisc->ops->attach(qdisc);
 			dev->qdisc = qdisc;
@@ -734,7 +753,8 @@ void dev_activate(struct net_device *dev)
 
 	need_watchdog = 0;
 	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
-	transition_one_qdisc(dev, &dev->rx_queue, NULL);
+	if (dev_ingress_queue(dev))
+		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
 
 	if (need_watchdog) {
 		dev->trans_start = jiffies;
@@ -779,7 +799,7 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 
 		spin_lock_bh(root_lock);
 
-		val = (test_bit(__QDISC_STATE_RUNNING, &q->state) ||
+		val = (qdisc_is_running(q) ||
 		       test_bit(__QDISC_STATE_SCHED, &q->state));
 
 		spin_unlock_bh(root_lock);
@@ -793,7 +813,8 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
-	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	dev_watchdog_down(dev);
 
@@ -819,7 +840,8 @@ void dev_init_scheduler(struct net_device *dev)
 {
 	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 
 	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
 }
@@ -842,7 +864,8 @@ static void shutdown_scheduler_queue(struct net_device *dev,
 void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
-	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+	if (dev_ingress_queue(dev))
+		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
 	qdisc_destroy(dev->qdisc);
 	dev->qdisc = &noop_qdisc;
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index abd904be428..069c62b7bb3 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -761,8 +761,8 @@ init_vf(struct hfsc_class *cl, unsigned int len)
 		if (f != cl->cl_f) {
 			cl->cl_f = f;
 			cftree_update(cl);
-			update_cfmin(cl->cl_parent);
 		}
+		update_cfmin(cl->cl_parent);
 	}
 }
 
@@ -1088,7 +1088,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	cl->refcnt    = 1;
 	cl->sched     = q;
 	cl->cl_parent = parent;
-	cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+	cl->qdisc = qdisc_create_dflt(sch->dev_queue,
 				      &pfifo_qdisc_ops, classid);
 	if (cl->qdisc == NULL)
 		cl->qdisc = &noop_qdisc;
@@ -1209,8 +1209,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (cl->level > 0)
 		return -EINVAL;
 	if (new == NULL) {
-		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					&pfifo_qdisc_ops,
+		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 					cl->cl_common.classid);
 		if (new == NULL)
 			new = &noop_qdisc;
@@ -1452,8 +1451,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
 	q->root.cl_common.classid = sch->handle;
 	q->root.refcnt  = 1;
 	q->root.sched   = q;
-	q->root.qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					  &pfifo_qdisc_ops,
+	q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 					  sch->handle);
 	if (q->root.qdisc == NULL)
 		q->root.qdisc = &noop_qdisc;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 0b52b8de562..01b519d6c52 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1121,8 +1121,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	if (cl->level)
 		return -EINVAL;
 	if (new == NULL &&
-	    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-				     &pfifo_qdisc_ops,
+	    (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 				     cl->common.classid)) == NULL)
 		return -ENOBUFS;
 
@@ -1247,8 +1246,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
 		return -EBUSY;
 
 	if (!cl->level && htb_parent_last_child(cl)) {
-		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					  &pfifo_qdisc_ops,
+		new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
 					  cl->parent->common.classid);
 		last_child = 1;
 	}
@@ -1302,14 +1300,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	struct htb_class *cl = (struct htb_class *)*arg, *parent;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
-	struct nlattr *tb[TCA_HTB_RTAB + 1];
+	struct nlattr *tb[__TCA_HTB_MAX];
 	struct tc_htb_opt *hopt;
 
 	/* extract all subattrs from opt attr */
 	if (!opt)
 		goto failure;
 
-	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
+	err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy);
 	if (err < 0)
 		goto failure;
 
@@ -1377,7 +1375,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
 		   so that can't be used inside of sch_tree_lock
 		   -- thanks to Karlis Peisenieks */
-		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+		new_q = qdisc_create_dflt(sch->dev_queue,
 					  &pfifo_qdisc_ops, classid);
 		sch_tree_lock(sch);
 		if (parent && !parent->level) {
@@ -1550,7 +1548,6 @@ static const struct Qdisc_class_ops htb_class_ops = {
 };
 
 static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
-	.next		=	NULL,
 	.cl_ops		=	&htb_class_ops,
 	.id		=	"htb",
 	.priv_size	=	sizeof(struct htb_sched),
@@ -1561,7 +1558,6 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
 	.init		=	htb_init,
 	.reset		=	htb_reset,
 	.destroy	=	htb_destroy,
-	.change		=	NULL /* htb_change */,
 	.dump		=	htb_dump,
 	.owner		=	THIS_MODULE,
 };
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index fe91e50f9d9..ecc302f4d2a 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -56,7 +56,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
 
 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
 		dev_queue = netdev_get_tx_queue(dev, ntx);
-		qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops,
+		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
 					  TC_H_MAKE(TC_H_MAJ(sch->handle),
 						    TC_H_MIN(ntx + 1)));
 		if (qdisc == NULL)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 6ae251279fc..32690deab5d 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -227,8 +227,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
 	for (i = 0; i < q->bands; i++) {
 		if (q->queues[i] == &noop_qdisc) {
 			struct Qdisc *child, *old;
-			child = qdisc_create_dflt(qdisc_dev(sch),
-						  sch->dev_queue,
+			child = qdisc_create_dflt(sch->dev_queue,
 						  &pfifo_qdisc_ops,
 						  TC_H_MAKE(sch->handle,
 							    i + 1));
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 4714ff162bb..e5593c083a7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -538,8 +538,7 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 
 	qdisc_watchdog_init(&q->watchdog, sch);
 
-	q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-				     &tfifo_qdisc_ops,
+	q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops,
 				     TC_H_MAKE(sch->handle, 1));
 	if (!q->qdisc) {
 		pr_debug("netem: qdisc create failed\n");
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 0748fb1e3a4..b1c95bce33c 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -200,7 +200,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
 	for (i=0; i<q->bands; i++) {
 		if (q->queues[i] == &noop_qdisc) {
 			struct Qdisc *child, *old;
-			child = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+			child = qdisc_create_dflt(sch->dev_queue,
 						  &pfifo_qdisc_ops,
 						  TC_H_MAKE(sch->handle, i + 1));
 			if (child) {
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index c65762823f5..3cf478d012d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -122,34 +122,44 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
 	{
-		const struct iphdr *iph = ip_hdr(skb);
+		const struct iphdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			goto err;
+		iph = ip_hdr(skb);
 		h = (__force u32)iph->daddr;
 		h2 = (__force u32)iph->saddr ^ iph->protocol;
-		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
-		    (iph->protocol == IPPROTO_TCP ||
-		     iph->protocol == IPPROTO_UDP ||
-		     iph->protocol == IPPROTO_UDPLITE ||
-		     iph->protocol == IPPROTO_SCTP ||
-		     iph->protocol == IPPROTO_DCCP ||
-		     iph->protocol == IPPROTO_ESP))
-			h2 ^= *(((u32*)iph) + iph->ihl);
+		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+			break;
+		poff = proto_ports_offset(iph->protocol);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
+			iph = ip_hdr(skb);
+			h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
+		}
 		break;
 	}
 	case htons(ETH_P_IPV6):
 	{
-		struct ipv6hdr *iph = ipv6_hdr(skb);
+		struct ipv6hdr *iph;
+		int poff;
+
+		if (!pskb_network_may_pull(skb, sizeof(*iph)))
+			goto err;
+		iph = ipv6_hdr(skb);
 		h = (__force u32)iph->daddr.s6_addr32[3];
 		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
-		if (iph->nexthdr == IPPROTO_TCP ||
-		    iph->nexthdr == IPPROTO_UDP ||
-		    iph->nexthdr == IPPROTO_UDPLITE ||
-		    iph->nexthdr == IPPROTO_SCTP ||
-		    iph->nexthdr == IPPROTO_DCCP ||
-		    iph->nexthdr == IPPROTO_ESP)
-			h2 ^= *(u32*)&iph[1];
+		poff = proto_ports_offset(iph->nexthdr);
+		if (poff >= 0 &&
+		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
+			iph = ipv6_hdr(skb);
+			h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
+		}
 		break;
 	}
 	default:
+err:
 		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
 		h2 = (unsigned long)skb->sk;
 	}
@@ -323,7 +333,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (++sch->q.qlen <= q->limit) {
 		sch->bstats.bytes += qdisc_pkt_len(skb);
 		sch->bstats.packets++;
-		return 0;
+		return NET_XMIT_SUCCESS;
 	}
 
 	sfq_drop(sch);
@@ -497,11 +507,26 @@ nla_put_failure:
 	return -1;
 }
 
+static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 {
 	return 0;
 }
 
+static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
+			      u32 classid)
+{
+	return 0;
+}
+
+static void sfq_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
@@ -554,8 +579,12 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 }
 
 static const struct Qdisc_class_ops sfq_class_ops = {
+	.leaf		=	sfq_leaf,
 	.get		=	sfq_get,
+	.put		=	sfq_put,
 	.tcf_chain	=	sfq_find_tcf,
+	.bind_tcf	=	sfq_bind,
+	.unbind_tcf	=	sfq_put,
 	.dump		=	sfq_dump_class,
 	.dump_stats	=	sfq_dump_class_stats,
 	.walk		=	sfq_walk,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 0991c640cd3..641a30d6463 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -127,7 +127,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 		return qdisc_reshape_fail(skb, sch);
 
 	ret = qdisc_enqueue(skb, q->qdisc);
-	if (ret != 0) {
+	if (ret != NET_XMIT_SUCCESS) {
 		if (net_xmit_drop_count(ret))
 			sch->qstats.drops++;
 		return ret;
@@ -136,7 +136,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	sch->q.qlen++;
 	sch->bstats.bytes += qdisc_pkt_len(skb);
 	sch->bstats.packets++;
-	return 0;
+	return NET_XMIT_SUCCESS;
 }
 
 static unsigned int tbf_drop(struct Qdisc* sch)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 807643bdcba..401af959670 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -85,7 +85,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 		__skb_queue_tail(&q->q, skb);
 		sch->bstats.bytes += qdisc_pkt_len(skb);
 		sch->bstats.packets++;
-		return 0;
+		return NET_XMIT_SUCCESS;
 	}
 
 	kfree_skb(skb);
@@ -241,11 +241,11 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
 	}
 	if (neigh_event_send(n, skb_res) == 0) {
 		int err;
+		char haddr[MAX_ADDR_LEN];
 
-		read_lock(&n->lock);
-		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-				      n->ha, NULL, skb->len);
-		read_unlock(&n->lock);
+		neigh_ha_snapshot(haddr, n, dev);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
+				      NULL, skb->len);
 
 		if (err < 0) {
 			neigh_release(n);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index e41feff19e4..5f1fb8bd862 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -48,6 +48,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/poll.h>
@@ -172,7 +174,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
 		(unsigned long)sp->autoclose * HZ;
 
-	/* Initilizes the timers */
+	/* Initializes the timers */
 	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
 		setup_timer(&asoc->timers[i], sctp_timer_events[i],
 				(unsigned long)asoc);
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 86366390038..ddbbf7c81fa 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -543,16 +543,20 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
 		id = ntohs(hmacs->hmac_ids[i]);
 
 		/* Check the id is in the supported range */
-		if (id > SCTP_AUTH_HMAC_ID_MAX)
+		if (id > SCTP_AUTH_HMAC_ID_MAX) {
+			id = 0;
 			continue;
+		}
 
 		/* See is we support the id.  Supported IDs have name and
 		 * length fields set, so that we can allocated and use
 		 * them.  We can safely just check for name, for without the
 		 * name, we can't allocate the TFM.
 		 */
-		if (!sctp_hmac_list[id].hmac_name)
+		if (!sctp_hmac_list[id].hmac_name) {
+			id = 0;
 			continue;
+		}
 
 		break;
 	}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 476caaf100e..6c8556459a7 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -37,6 +37,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/net.h>
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index ccb6dc48d15..397296fb156 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -43,6 +43,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 #include <linux/interrupt.h>
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 732689140fb..95e0c8eda1a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -47,6 +47,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/types.h>
@@ -336,7 +338,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
 		memcpy(saddr, baddr, sizeof(union sctp_addr));
 		SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr);
 	} else {
-		printk(KERN_ERR "%s: asoc:%p Could not find a valid source "
+		pr_err("%s: asoc:%p Could not find a valid source "
 		       "address for the dest:%pI6\n",
 		       __func__, asoc, &daddr->v6.sin6_addr);
 	}
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index f73ec0ea93b..8ef8e7d9eb6 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -38,6 +38,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <net/sctp/sctp.h>
 
@@ -134,8 +136,7 @@ void sctp_dbg_objcnt_init(void)
 	ent = proc_create("sctp_dbg_objcnt", 0,
 			  proc_net_sctp, &sctp_objcnt_ops);
 	if (!ent)
-		printk(KERN_WARNING
-			"sctp_dbg_objcnt: Unable to create /proc entry.\n");
+		pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
 }
 
 /* Cleanup the objcount entry in the proc filesystem.  */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a646681f5ac..60600d337a3 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -41,6 +41,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/wait.h>
@@ -92,7 +94,6 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet,
 	SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __func__,
 			  packet, vtag);
 
-	sctp_packet_reset(packet);
 	packet->vtag = vtag;
 
 	if (ecn_capable && sctp_packet_empty(packet)) {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index c04b2eb5918..8c6d379b4bb 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -46,6 +46,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/list.h>   /* For struct list_head */
 #include <linux/socket.h>
@@ -1463,23 +1465,23 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 					/* Display the end of the
 					 * current range.
 					 */
-					SCTP_DEBUG_PRINTK("-%08x",
-							  dbg_last_ack_tsn);
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_ack_tsn);
 				}
 
 				/* Start a new range.  */
-				SCTP_DEBUG_PRINTK(",%08x", tsn);
+				SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
 				dbg_ack_tsn = tsn;
 				break;
 
 			case 1:	/* The last TSN was NOT ACKed. */
 				if (dbg_last_kept_tsn != dbg_kept_tsn) {
 					/* Display the end of current range. */
-					SCTP_DEBUG_PRINTK("-%08x",
-							  dbg_last_kept_tsn);
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_kept_tsn);
 				}
 
-				SCTP_DEBUG_PRINTK("\n");
+				SCTP_DEBUG_PRINTK_CONT("\n");
 
 				/* FALL THROUGH... */
 			default:
@@ -1526,18 +1528,18 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 					break;
 
 				if (dbg_last_kept_tsn != dbg_kept_tsn)
-					SCTP_DEBUG_PRINTK("-%08x",
-							  dbg_last_kept_tsn);
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_kept_tsn);
 
-				SCTP_DEBUG_PRINTK(",%08x", tsn);
+				SCTP_DEBUG_PRINTK_CONT(",%08x", tsn);
 				dbg_kept_tsn = tsn;
 				break;
 
 			case 0:
 				if (dbg_last_ack_tsn != dbg_ack_tsn)
-					SCTP_DEBUG_PRINTK("-%08x",
-							  dbg_last_ack_tsn);
-				SCTP_DEBUG_PRINTK("\n");
+					SCTP_DEBUG_PRINTK_CONT("-%08x",
+							       dbg_last_ack_tsn);
+				SCTP_DEBUG_PRINTK_CONT("\n");
 
 				/* FALL THROUGH... */
 			default:
@@ -1556,17 +1558,17 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 	switch (dbg_prt_state) {
 	case 0:
 		if (dbg_last_ack_tsn != dbg_ack_tsn) {
-			SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn);
+			SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn);
 		} else {
-			SCTP_DEBUG_PRINTK("\n");
+			SCTP_DEBUG_PRINTK_CONT("\n");
 		}
 	break;
 
 	case 1:
 		if (dbg_last_kept_tsn != dbg_kept_tsn) {
-			SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn);
+			SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn);
 		} else {
-			SCTP_DEBUG_PRINTK("\n");
+			SCTP_DEBUG_PRINTK_CONT("\n");
 		}
 	}
 #endif /* SCTP_DEBUG */
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index db3a42b8b34..bc6cd75cc1d 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -22,6 +22,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/socket.h>
@@ -117,6 +119,7 @@ static const struct file_operations sctpprobe_fops = {
 	.owner	= THIS_MODULE,
 	.open	= sctpprobe_open,
 	.read	= sctpprobe_read,
+	.llseek = noop_llseek,
 };
 
 sctp_disposition_t jsctp_sf_eat_sack(const struct sctp_endpoint *ep,
@@ -192,7 +195,7 @@ static __init int sctpprobe_init(void)
 	if (ret)
 		goto remove_proc;
 
-	pr_info("SCTP probe registered (port=%d)\n", port);
+	pr_info("probe registered (port=%d)\n", port);
 
 	return 0;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 182749867c7..1ef29c74d85 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -46,6 +46,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/netdevice.h>
@@ -490,7 +492,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 			  __func__, &fl.fl4_dst, &fl.fl4_src);
 
 	if (!ip_route_output_key(&init_net, &rt, &fl)) {
-		dst = &rt->u.dst;
+		dst = &rt->dst;
 	}
 
 	/* If there is no association or if a source address is passed, no
@@ -534,7 +536,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
 			fl.fl4_src = laddr->a.v4.sin_addr.s_addr;
 			fl.fl_ip_sport = laddr->a.v4.sin_port;
 			if (!ip_route_output_key(&init_net, &rt, &fl)) {
-				dst = &rt->u.dst;
+				dst = &rt->dst;
 				goto out_unlock;
 			}
 		}
@@ -707,8 +709,7 @@ static int sctp_ctl_sock_init(void)
 					   &init_net);
 
 	if (err < 0) {
-		printk(KERN_ERR
-		       "SCTP: Failed to create the SCTP control socket.\n");
+		pr_err("Failed to create the SCTP control socket\n");
 		return err;
 	}
 	return 0;
@@ -798,7 +799,7 @@ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
 static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
 {
 	/* PF_INET only supports AF_INET addresses. */
-	return (AF_INET == family);
+	return AF_INET == family;
 }
 
 /* Address matching with wildcards allowed. */
@@ -1002,7 +1003,8 @@ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family)
 static inline int init_sctp_mibs(void)
 {
 	return snmp_mib_init((void __percpu **)sctp_statistics,
-			     sizeof(struct sctp_mib));
+			     sizeof(struct sctp_mib),
+			     __alignof__(struct sctp_mib));
 }
 
 static inline void cleanup_sctp_mibs(void)
@@ -1162,7 +1164,7 @@ SCTP_STATIC __init int sctp_init(void)
 	/* Set the pressure threshold to be a fraction of global memory that
 	 * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
 	 * memory, with a floor of 128 pages.
-	 * Note this initalizes the data in sctpv6_prot too
+	 * Note this initializes the data in sctpv6_prot too
 	 * Unabashedly stolen from tcp_init
 	 */
 	nr_pages = totalram_pages - totalhigh_pages;
@@ -1205,7 +1207,7 @@ SCTP_STATIC __init int sctp_init(void)
 					__get_free_pages(GFP_ATOMIC, order);
 	} while (!sctp_assoc_hashtable && --order > 0);
 	if (!sctp_assoc_hashtable) {
-		printk(KERN_ERR "SCTP: Failed association hash alloc.\n");
+		pr_err("Failed association hash alloc\n");
 		status = -ENOMEM;
 		goto err_ahash_alloc;
 	}
@@ -1219,7 +1221,7 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_ep_hashtable = (struct sctp_hashbucket *)
 		kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL);
 	if (!sctp_ep_hashtable) {
-		printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n");
+		pr_err("Failed endpoint_hash alloc\n");
 		status = -ENOMEM;
 		goto err_ehash_alloc;
 	}
@@ -1238,7 +1240,7 @@ SCTP_STATIC __init int sctp_init(void)
 					__get_free_pages(GFP_ATOMIC, order);
 	} while (!sctp_port_hashtable && --order > 0);
 	if (!sctp_port_hashtable) {
-		printk(KERN_ERR "SCTP: Failed bind hash alloc.");
+		pr_err("Failed bind hash alloc\n");
 		status = -ENOMEM;
 		goto err_bhash_alloc;
 	}
@@ -1247,8 +1249,7 @@ SCTP_STATIC __init int sctp_init(void)
 		INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
 	}
 
-	printk(KERN_INFO "SCTP: Hash tables configured "
-			 "(established %d bind %d)\n",
+	pr_info("Hash tables configured (established %d bind %d)\n",
 		sctp_assoc_hashsize, sctp_port_hashsize);
 
 	/* Disable ADDIP by default. */
@@ -1289,8 +1290,7 @@ SCTP_STATIC __init int sctp_init(void)
 
 	/* Initialize the control inode/socket for handling OOTB packets.  */
 	if ((status = sctp_ctl_sock_init())) {
-		printk (KERN_ERR
-			"SCTP: Failed to initialize the SCTP control sock.\n");
+		pr_err("Failed to initialize the SCTP control sock\n");
 		goto err_ctl_sock_init;
 	}
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index bd2a50b482a..2cc46f0962c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -50,6 +50,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/ip.h>
@@ -1817,7 +1819,7 @@ malformed:
 struct __sctp_missing {
 	__be32 num_missing;
 	__be16 type;
-}  __attribute__((packed));
+}  __packed;
 
 /*
  * Report a missing mandatory parameter.
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f5e5e27cac5..b21b218d564 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -47,6 +47,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/skbuff.h>
 #include <linux/types.h>
 #include <linux/socket.h>
@@ -1146,26 +1148,23 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 
 	case SCTP_DISPOSITION_VIOLATION:
 		if (net_ratelimit())
-			printk(KERN_ERR "sctp protocol violation state %d "
-			       "chunkid %d\n", state, subtype.chunk);
+			pr_err("protocol violation state %d chunkid %d\n",
+			       state, subtype.chunk);
 		break;
 
 	case SCTP_DISPOSITION_NOT_IMPL:
-		printk(KERN_WARNING "sctp unimplemented feature in state %d, "
-		       "event_type %d, event_id %d\n",
-		       state, event_type, subtype.chunk);
+		pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n",
+			state, event_type, subtype.chunk);
 		break;
 
 	case SCTP_DISPOSITION_BUG:
-		printk(KERN_ERR "sctp bug in state %d, "
-		       "event_type %d, event_id %d\n",
+		pr_err("bug in state %d, event_type %d, event_id %d\n",
 		       state, event_type, subtype.chunk);
 		BUG();
 		break;
 
 	default:
-		printk(KERN_ERR "sctp impossible disposition %d "
-		       "in state %d, event_type %d, event_id %d\n",
+		pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n",
 		       status, state, event_type, subtype.chunk);
 		BUG();
 		break;
@@ -1679,8 +1678,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			sctp_cmd_send_asconf(asoc);
 			break;
 		default:
-			printk(KERN_WARNING "Impossible command: %u, %p\n",
-			       cmd->verb, cmd->obj.ptr);
+			pr_warn("Impossible command: %u, %p\n",
+				cmd->verb, cmd->obj.ptr);
 			break;
 		}
 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 24b2cd55563..4b4eb7c96bb 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -50,6 +50,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/ip.h>
@@ -1138,18 +1140,16 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
 	if (unlikely(!link)) {
 		if (from_addr.sa.sa_family == AF_INET6) {
 			if (net_ratelimit())
-				printk(KERN_WARNING
-				    "%s association %p could not find address %pI6\n",
-				    __func__,
-				    asoc,
-				    &from_addr.v6.sin6_addr);
+				pr_warn("%s association %p could not find address %pI6\n",
+					__func__,
+					asoc,
+					&from_addr.v6.sin6_addr);
 		} else {
 			if (net_ratelimit())
-				printk(KERN_WARNING
-				    "%s association %p could not find address %pI4\n",
-				    __func__,
-				    asoc,
-				    &from_addr.v4.sin_addr.s_addr);
+				pr_warn("%s association %p could not find address %pI4\n",
+					__func__,
+					asoc,
+					&from_addr.v4.sin_addr.s_addr);
 		}
 		return SCTP_DISPOSITION_DISCARD;
 	}
@@ -1232,6 +1232,18 @@ out:
 	return 0;
 }
 
+static bool list_has_sctp_addr(const struct list_head *list,
+			       union sctp_addr *ipaddr)
+{
+	struct sctp_transport *addr;
+
+	list_for_each_entry(addr, list, transports) {
+		if (sctp_cmp_addr_exact(ipaddr, &addr->ipaddr))
+			return true;
+	}
+
+	return false;
+}
 /* A restart is occurring, check to make sure no new addresses
  * are being added as we may be under a takeover attack.
  */
@@ -1240,10 +1252,10 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
 				       struct sctp_chunk *init,
 				       sctp_cmd_seq_t *commands)
 {
-	struct sctp_transport *new_addr, *addr;
-	int found;
+	struct sctp_transport *new_addr;
+	int ret = 1;
 
-	/* Implementor's Guide - Sectin 5.2.2
+	/* Implementor's Guide - Section 5.2.2
 	 * ...
 	 * Before responding the endpoint MUST check to see if the
 	 * unexpected INIT adds new addresses to the association. If new
@@ -1254,31 +1266,19 @@ static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc,
 	/* Search through all current addresses and make sure
 	 * we aren't adding any new ones.
 	 */
-	new_addr = NULL;
-	found = 0;
-
 	list_for_each_entry(new_addr, &new_asoc->peer.transport_addr_list,
-			transports) {
-		found = 0;
-		list_for_each_entry(addr, &asoc->peer.transport_addr_list,
-				transports) {
-			if (sctp_cmp_addr_exact(&new_addr->ipaddr,
-						&addr->ipaddr)) {
-				found = 1;
-				break;
-			}
-		}
-		if (!found)
+			    transports) {
+		if (!list_has_sctp_addr(&asoc->peer.transport_addr_list,
+					&new_addr->ipaddr)) {
+			sctp_sf_send_restart_abort(&new_addr->ipaddr, init,
+						   commands);
+			ret = 0;
 			break;
-	}
-
-	/* If a new address was added, ABORT the sender. */
-	if (!found && new_addr) {
-		sctp_sf_send_restart_abort(&new_addr->ipaddr, init, commands);
+		}
 	}
 
 	/* Return success if all addresses were found. */
-	return found;
+	return ret;
 }
 
 /* Populate the verification/tie tags based on overlapping INIT
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 6d9b3aafcc5..546d4387fb3 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -46,6 +46,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/skbuff.h>
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
@@ -66,15 +68,19 @@ static const sctp_sm_table_entry_t bug = {
 	.name = "sctp_sf_bug"
 };
 
-#define DO_LOOKUP(_max, _type, _table) \
-	if ((event_subtype._type > (_max))) { \
-		printk(KERN_WARNING \
-		       "sctp table %p possible attack:" \
-		       " event %d exceeds max %d\n", \
-		       _table, event_subtype._type, _max); \
-		return &bug; \
-	} \
-	return &_table[event_subtype._type][(int)state];
+#define DO_LOOKUP(_max, _type, _table)					\
+({									\
+	const sctp_sm_table_entry_t *rtn;				\
+									\
+	if ((event_subtype._type > (_max))) {				\
+		pr_warn("table %p possible attack: event %d exceeds max %d\n", \
+			_table, event_subtype._type, _max);		\
+	        rtn = &bug;						\
+	} else								\
+		rtn = &_table[event_subtype._type][(int)state];		\
+									\
+	rtn;								\
+})
 
 const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
 						  sctp_state_t state,
@@ -83,21 +89,15 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type,
 	switch (event_type) {
 	case SCTP_EVENT_T_CHUNK:
 		return sctp_chunk_event_lookup(event_subtype.chunk, state);
-		break;
 	case SCTP_EVENT_T_TIMEOUT:
-		DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
-			  timeout_event_table);
-		break;
-
+		return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout,
+				 timeout_event_table);
 	case SCTP_EVENT_T_OTHER:
-		DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table);
-		break;
-
+		return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other,
+				 other_event_table);
 	case SCTP_EVENT_T_PRIMITIVE:
-		DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
-			  primitive_event_table);
-		break;
-
+		return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive,
+				 primitive_event_table);
 	default:
 		/* Yikes!  We got an illegal event type.  */
 		return &bug;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ca44917872d..e34ca9cc116 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -57,6 +57,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/wait.h>
@@ -916,6 +918,11 @@ SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk,
 	/* Walk through the addrs buffer and count the number of addresses. */
 	addr_buf = kaddrs;
 	while (walk_size < addrs_size) {
+		if (walk_size + sizeof(sa_family_t) > addrs_size) {
+			kfree(kaddrs);
+			return -EINVAL;
+		}
+
 		sa_addr = (struct sockaddr *)addr_buf;
 		af = sctp_get_af_specific(sa_addr->sa_family);
 
@@ -1002,9 +1009,13 @@ static int __sctp_connect(struct sock* sk,
 	/* Walk through the addrs buffer and count the number of addresses. */
 	addr_buf = kaddrs;
 	while (walk_size < addrs_size) {
+		if (walk_size + sizeof(sa_family_t) > addrs_size) {
+			err = -EINVAL;
+			goto out_free;
+		}
+
 		sa_addr = (union sctp_addr *)addr_buf;
 		af = sctp_get_af_specific(sa_addr->sa.sa_family);
-		port = ntohs(sa_addr->v4.sin_port);
 
 		/* If the address family is not supported or if this address
 		 * causes the address buffer to overflow return EINVAL.
@@ -1014,6 +1025,8 @@ static int __sctp_connect(struct sock* sk,
 			goto out_free;
 		}
 
+		port = ntohs(sa_addr->v4.sin_port);
+
 		/* Save current address so we can work with it */
 		memcpy(&to, sa_addr, af->sockaddr_len);
 
@@ -2458,9 +2471,8 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk,
 		if (params.sack_delay == 0 && params.sack_freq == 0)
 			return 0;
 	} else if (optlen == sizeof(struct sctp_assoc_value)) {
-		printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value "
-		       "in delayed_ack socket option deprecated\n");
-		printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n");
+		pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
+		pr_warn("Use struct sctp_sack_info instead\n");
 		if (copy_from_user(&params, optval, optlen))
 			return -EFAULT;
 
@@ -2868,10 +2880,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 	int val;
 
 	if (optlen == sizeof(int)) {
-		printk(KERN_WARNING
-		   "SCTP: Use of int in maxseg socket option deprecated\n");
-		printk(KERN_WARNING
-		   "SCTP: Use struct sctp_assoc_value instead\n");
+		pr_warn("Use of int in maxseg socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
 		if (copy_from_user(&val, optval, optlen))
 			return -EFAULT;
 		params.assoc_id = 0;
@@ -3121,10 +3131,8 @@ static int sctp_setsockopt_maxburst(struct sock *sk,
 	int assoc_id = 0;
 
 	if (optlen == sizeof(int)) {
-		printk(KERN_WARNING
-		   "SCTP: Use of int in max_burst socket option deprecated\n");
-		printk(KERN_WARNING
-		   "SCTP: Use struct sctp_assoc_value instead\n");
+		pr_warn("Use of int in max_burst socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
 		if (copy_from_user(&val, optval, optlen))
 			return -EFAULT;
 	} else if (optlen == sizeof(struct sctp_assoc_value)) {
@@ -3595,7 +3603,40 @@ out:
 /* The SCTP ioctl handler. */
 SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
-	return -ENOIOCTLCMD;
+	int rc = -ENOTCONN;
+
+	sctp_lock_sock(sk);
+
+	/*
+	 * SEQPACKET-style sockets in LISTENING state are valid, for
+	 * SCTP, so only discard TCP-style sockets in LISTENING state.
+	 */
+	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
+		goto out;
+
+	switch (cmd) {
+	case SIOCINQ: {
+		struct sk_buff *skb;
+		unsigned int amount = 0;
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		if (skb != NULL) {
+			/*
+			 * We will only return the amount of this packet since
+			 * that is all that will be read.
+			 */
+			amount = skb->len;
+		}
+		rc = put_user(amount, (int __user *)arg);
+		break;
+	}
+	default:
+		rc = -ENOIOCTLCMD;
+		break;
+	}
+out:
+	sctp_release_sock(sk);
+	return rc;
 }
 
 /* This is the function which gets called during socket creation to
@@ -3854,7 +3895,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
 	}
 
 out:
-	return (retval);
+	return retval;
 }
 
 
@@ -3910,7 +3951,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
 	}
 
 out:
-	return (retval);
+	return retval;
 }
 
 /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
@@ -4281,9 +4322,8 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len,
 		if (copy_from_user(&params, optval, len))
 			return -EFAULT;
 	} else if (len == sizeof(struct sctp_assoc_value)) {
-		printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value "
-		       "in delayed_ack socket option deprecated\n");
-		printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n");
+		pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n");
+		pr_warn("Use struct sctp_sack_info instead\n");
 		if (copy_from_user(&params, optval, len))
 			return -EFAULT;
 	} else
@@ -4929,10 +4969,8 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len,
 	struct sctp_association *asoc;
 
 	if (len == sizeof(int)) {
-		printk(KERN_WARNING
-		   "SCTP: Use of int in maxseg socket option deprecated\n");
-		printk(KERN_WARNING
-		   "SCTP: Use struct sctp_assoc_value instead\n");
+		pr_warn("Use of int in maxseg socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
 		params.assoc_id = 0;
 	} else if (len >= sizeof(struct sctp_assoc_value)) {
 		len = sizeof(struct sctp_assoc_value);
@@ -5023,10 +5061,8 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len,
 	struct sctp_association *asoc;
 
 	if (len == sizeof(int)) {
-		printk(KERN_WARNING
-		   "SCTP: Use of int in max_burst socket option deprecated\n");
-		printk(KERN_WARNING
-		   "SCTP: Use struct sctp_assoc_value instead\n");
+		pr_warn("Use of int in max_burst socket option deprecated\n");
+		pr_warn("Use struct sctp_assoc_value instead\n");
 		params.assoc_id = 0;
 	} else if (len >= sizeof(struct sctp_assoc_value)) {
 		len = sizeof(struct sctp_assoc_value);
@@ -5569,7 +5605,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
 	/* Note: sk->sk_num gets filled in if ephemeral port request. */
 	ret = sctp_get_port_local(sk, &addr);
 
-	return (ret ? 1 : 0);
+	return ret ? 1 : 0;
 }
 
 /*
@@ -5586,8 +5622,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
 		tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
 		if (IS_ERR(tfm)) {
 			if (net_ratelimit()) {
-				printk(KERN_INFO
-				       "SCTP: failed to load transform for %s: %ld\n",
+				pr_info("failed to load transform for %s: %ld\n",
 					sctp_hmac_alg, PTR_ERR(tfm));
 			}
 			return -ENOSYS;
@@ -5716,13 +5751,12 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
 	/* Is it readable?  Reconsider this code with TCP-style support.  */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* The association is either gone or not ready.  */
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 132046cb82f..d3ae493d234 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -48,6 +48,8 @@
  * be incorporated into the next SCTP release.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/random.h>
@@ -244,10 +246,9 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 	struct dst_entry *dst;
 
 	if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) {
-		printk(KERN_WARNING "%s: Reported pmtu %d too low, "
-		       "using default minimum of %d\n",
-		       __func__, pmtu,
-		       SCTP_DEFAULT_MINSEGMENT);
+		pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n",
+			__func__, pmtu,
+			SCTP_DEFAULT_MINSEGMENT);
 		/* Use default minimum segment size and disable
 		 * pmtu discovery on this transport.
 		 */
diff --git a/net/socket.c b/net/socket.c
index 367d5477d00..3ca2fd9e372 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -124,7 +124,7 @@ static int sock_fasync(int fd, struct file *filp, int on);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
-			        struct pipe_inode_info *pipe, size_t len,
+				struct pipe_inode_info *pipe, size_t len,
 				unsigned int flags);
 
 /*
@@ -162,7 +162,7 @@ static const struct net_proto_family *net_families[NPROTO] __read_mostly;
  *	Statistics counters of the socket lists
  */
 
-static DEFINE_PER_CPU(int, sockets_in_use) = 0;
+static DEFINE_PER_CPU(int, sockets_in_use);
 
 /*
  * Support routines.
@@ -170,15 +170,6 @@ static DEFINE_PER_CPU(int, sockets_in_use) = 0;
  * divide and look after the messy bits.
  */
 
-#define MAX_SOCK_ADDR	128		/* 108 for Unix domain -
-					   16 for IP, 16 for IPX,
-					   24 for IPv6,
-					   about 80 for AX.25
-					   must be at least one bigger than
-					   the AF_UNIX size (see net/unix/af_unix.c
-					   :unix_mkname()).
-					 */
-
 /**
  *	move_addr_to_kernel	-	copy a socket address into kernel space
  *	@uaddr: Address in user space
@@ -218,8 +209,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
  *	specified. Zero is returned for a success.
  */
 
-int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr,
-		      int __user *ulen)
+static int move_addr_to_user(struct sockaddr *kaddr, int klen,
+			     void __user *uaddr, int __user *ulen)
 {
 	int err;
 	int len;
@@ -309,24 +300,22 @@ static int init_inodecache(void)
 }
 
 static const struct super_operations sockfs_ops = {
-	.alloc_inode =	sock_alloc_inode,
-	.destroy_inode =sock_destroy_inode,
-	.statfs =	simple_statfs,
+	.alloc_inode	= sock_alloc_inode,
+	.destroy_inode	= sock_destroy_inode,
+	.statfs		= simple_statfs,
 };
 
-static int sockfs_get_sb(struct file_system_type *fs_type,
-			 int flags, const char *dev_name, void *data,
-			 struct vfsmount *mnt)
+static struct dentry *sockfs_mount(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data)
 {
-	return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC,
-			     mnt);
+	return mount_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC);
 }
 
 static struct vfsmount *sock_mnt __read_mostly;
 
 static struct file_system_type sock_fs_type = {
 	.name =		"sockfs",
-	.get_sb =	sockfs_get_sb,
+	.mount =	sockfs_mount,
 	.kill_sb =	kill_anon_super,
 };
 
@@ -386,7 +375,7 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
 		  &socket_file_ops);
 	if (unlikely(!file)) {
 		/* drop dentry, keep inode */
-		atomic_inc(&path.dentry->d_inode->i_count);
+		ihold(path.dentry->d_inode);
 		path_put(&path);
 		put_unused_fd(fd);
 		return -ENFILE;
@@ -411,6 +400,7 @@ int sock_map_fd(struct socket *sock, int flags)
 
 	return fd;
 }
+EXPORT_SYMBOL(sock_map_fd);
 
 static struct socket *sock_from_file(struct file *file, int *err)
 {
@@ -422,7 +412,7 @@ static struct socket *sock_from_file(struct file *file, int *err)
 }
 
 /**
- *	sockfd_lookup	- 	Go from a file number to its socket slot
+ *	sockfd_lookup - Go from a file number to its socket slot
  *	@fd: file handle
  *	@err: pointer to an error code return
  *
@@ -450,6 +440,7 @@ struct socket *sockfd_lookup(int fd, int *err)
 		fput(file);
 	return sock;
 }
+EXPORT_SYMBOL(sockfd_lookup);
 
 static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 {
@@ -487,6 +478,7 @@ static struct socket *sock_alloc(void)
 	sock = SOCKET_I(inode);
 
 	kmemcheck_annotate_bitfield(sock, type);
+	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
@@ -509,6 +501,7 @@ static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
 const struct file_operations bad_sock_fops = {
 	.owner = THIS_MODULE,
 	.open = sock_no_open,
+	.llseek = noop_llseek,
 };
 
 /**
@@ -540,15 +533,15 @@ void sock_release(struct socket *sock)
 	}
 	sock->file = NULL;
 }
+EXPORT_SYMBOL(sock_release);
 
-int sock_tx_timestamp(struct msghdr *msg, struct sock *sk,
-		      union skb_shared_tx *shtx)
+int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags)
 {
-	shtx->flags = 0;
+	*tx_flags = 0;
 	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
-		shtx->hardware = 1;
+		*tx_flags |= SKBTX_HW_TSTAMP;
 	if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
-		shtx->software = 1;
+		*tx_flags |= SKBTX_SW_TSTAMP;
 	return 0;
 }
 EXPORT_SYMBOL(sock_tx_timestamp);
@@ -586,6 +579,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 		ret = wait_on_sync_kiocb(&iocb);
 	return ret;
 }
+EXPORT_SYMBOL(sock_sendmsg);
 
 int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 		   struct kvec *vec, size_t num, size_t size)
@@ -604,6 +598,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
 	set_fs(oldfs);
 	return result;
 }
+EXPORT_SYMBOL(kernel_sendmsg);
 
 static int ktime2ts(ktime_t kt, struct timespec *ts)
 {
@@ -664,10 +659,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		put_cmsg(msg, SOL_SOCKET,
 			 SCM_TIMESTAMPING, sizeof(ts), &ts);
 }
-
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
-inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
+static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
+				   struct sk_buff *skb)
 {
 	if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount)
 		put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL,
@@ -720,6 +715,7 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg,
 		ret = wait_on_sync_kiocb(&iocb);
 	return ret;
 }
+EXPORT_SYMBOL(sock_recvmsg);
 
 static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
 			      size_t size, int flags)
@@ -752,6 +748,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 	set_fs(oldfs);
 	return result;
 }
+EXPORT_SYMBOL(kernel_recvmsg);
 
 static void sock_aio_dtor(struct kiocb *iocb)
 {
@@ -774,7 +771,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 }
 
 static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
-			        struct pipe_inode_info *pipe, size_t len,
+				struct pipe_inode_info *pipe, size_t len,
 				unsigned int flags)
 {
 	struct socket *sock = file->private_data;
@@ -887,7 +884,7 @@ static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
  */
 
 static DEFINE_MUTEX(br_ioctl_mutex);
-static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg) = NULL;
+static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
 
 void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
 {
@@ -895,7 +892,6 @@ void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
 	br_ioctl_hook = hook;
 	mutex_unlock(&br_ioctl_mutex);
 }
-
 EXPORT_SYMBOL(brioctl_set);
 
 static DEFINE_MUTEX(vlan_ioctl_mutex);
@@ -907,7 +903,6 @@ void vlan_ioctl_set(int (*hook) (struct net *, void __user *))
 	vlan_ioctl_hook = hook;
 	mutex_unlock(&vlan_ioctl_mutex);
 }
-
 EXPORT_SYMBOL(vlan_ioctl_set);
 
 static DEFINE_MUTEX(dlci_ioctl_mutex);
@@ -919,7 +914,6 @@ void dlci_ioctl_set(int (*hook) (unsigned int, void __user *))
 	dlci_ioctl_hook = hook;
 	mutex_unlock(&dlci_ioctl_mutex);
 }
-
 EXPORT_SYMBOL(dlci_ioctl_set);
 
 static long sock_do_ioctl(struct net *net, struct socket *sock,
@@ -1047,6 +1041,7 @@ out_release:
 	sock = NULL;
 	goto out;
 }
+EXPORT_SYMBOL(sock_create_lite);
 
 /* No kernel lock held - perfect */
 static unsigned int sock_poll(struct file *file, poll_table *wait)
@@ -1147,8 +1142,9 @@ call_kill:
 	rcu_read_unlock();
 	return 0;
 }
+EXPORT_SYMBOL(sock_wake_async);
 
-static int __sock_create(struct net *net, int family, int type, int protocol,
+int __sock_create(struct net *net, int family, int type, int protocol,
 			 struct socket **res, int kern)
 {
 	int err;
@@ -1260,16 +1256,19 @@ out_release:
 	rcu_read_unlock();
 	goto out_sock_release;
 }
+EXPORT_SYMBOL(__sock_create);
 
 int sock_create(int family, int type, int protocol, struct socket **res)
 {
 	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
 }
+EXPORT_SYMBOL(sock_create);
 
 int sock_create_kern(int family, int type, int protocol, struct socket **res)
 {
 	return __sock_create(&init_net, family, type, protocol, res, 1);
 }
+EXPORT_SYMBOL(sock_create_kern);
 
 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
 {
@@ -1474,7 +1473,8 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
 		goto out;
 
 	err = -ENFILE;
-	if (!(newsock = sock_alloc()))
+	newsock = sock_alloc();
+	if (!newsock)
 		goto out_put;
 
 	newsock->type = sock->type;
@@ -1652,6 +1652,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
 	struct iovec iov;
 	int fput_needed;
 
+	if (len > INT_MAX)
+		len = INT_MAX;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
 		goto out;
@@ -1709,6 +1711,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
 	int err, err2;
 	int fput_needed;
 
+	if (size > INT_MAX)
+		size = INT_MAX;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
 		goto out;
@@ -1861,8 +1865,7 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
 	if (MSG_CMSG_COMPAT & flags) {
 		if (get_compat_msghdr(&msg_sys, msg_compat))
 			return -EFAULT;
-	}
-	else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
+	} else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
 		return -EFAULT;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
@@ -1921,7 +1924,8 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags)
 		 * Afterwards, it will be a kernel pointer. Thus the compiler-assisted
 		 * checking falls down on this.
 		 */
-		if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control,
+		if (copy_from_user(ctl_buf,
+				   (void __user __force *)msg_sys.msg_control,
 				   ctl_len))
 			goto out_freectl;
 		msg_sys.msg_control = ctl_buf;
@@ -1964,8 +1968,7 @@ static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
 	if (MSG_CMSG_COMPAT & flags) {
 		if (get_compat_msghdr(msg_sys, msg_compat))
 			return -EFAULT;
-	}
-	else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
+	} else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
 		return -EFAULT;
 
 	err = -EMSGSIZE;
@@ -2191,10 +2194,10 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
 /* Argument list sizes for sys_socketcall */
 #define AL(x) ((x) * sizeof(unsigned long))
 static const unsigned char nargs[20] = {
-	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
-	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
-	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-	AL(4),AL(5)
+	AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
+	AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
+	AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
+	AL(4), AL(5)
 };
 
 #undef AL
@@ -2340,6 +2343,7 @@ int sock_register(const struct net_proto_family *ops)
 	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
 	return err;
 }
+EXPORT_SYMBOL(sock_register);
 
 /**
  *	sock_unregister - remove a protocol handler
@@ -2366,6 +2370,7 @@ void sock_unregister(int family)
 
 	printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
 }
+EXPORT_SYMBOL(sock_unregister);
 
 static int __init sock_init(void)
 {
@@ -2395,6 +2400,10 @@ static int __init sock_init(void)
 	netfilter_init();
 #endif
 
+#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
+	skb_timestamping_init();
+#endif
+
 	return 0;
 }
 
@@ -2490,13 +2499,13 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
 		ifc.ifc_req = NULL;
 		uifc = compat_alloc_user_space(sizeof(struct ifconf));
 	} else {
-		size_t len =((ifc32.ifc_len / sizeof (struct compat_ifreq)) + 1) *
-			sizeof (struct ifreq);
+		size_t len = ((ifc32.ifc_len / sizeof(struct compat_ifreq)) + 1) *
+			sizeof(struct ifreq);
 		uifc = compat_alloc_user_space(sizeof(struct ifconf) + len);
 		ifc.ifc_len = len;
 		ifr = ifc.ifc_req = (void __user *)(uifc + 1);
 		ifr32 = compat_ptr(ifc32.ifcbuf);
-		for (i = 0; i < ifc32.ifc_len; i += sizeof (struct compat_ifreq)) {
+		for (i = 0; i < ifc32.ifc_len; i += sizeof(struct compat_ifreq)) {
 			if (copy_in_user(ifr, ifr32, sizeof(struct compat_ifreq)))
 				return -EFAULT;
 			ifr++;
@@ -2516,9 +2525,9 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
 	ifr = ifc.ifc_req;
 	ifr32 = compat_ptr(ifc32.ifcbuf);
 	for (i = 0, j = 0;
-             i + sizeof (struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len;
-	     i += sizeof (struct compat_ifreq), j += sizeof (struct ifreq)) {
-		if (copy_in_user(ifr32, ifr, sizeof (struct compat_ifreq)))
+	     i + sizeof(struct compat_ifreq) <= ifc32.ifc_len && j < ifc.ifc_len;
+	     i += sizeof(struct compat_ifreq), j += sizeof(struct ifreq)) {
+		if (copy_in_user(ifr32, ifr, sizeof(struct compat_ifreq)))
 			return -EFAULT;
 		ifr32++;
 		ifr++;
@@ -2567,7 +2576,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32
 	compat_uptr_t uptr32;
 	struct ifreq __user *uifr;
 
-	uifr = compat_alloc_user_space(sizeof (*uifr));
+	uifr = compat_alloc_user_space(sizeof(*uifr));
 	if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
 		return -EFAULT;
 
@@ -2601,9 +2610,9 @@ static int bond_ioctl(struct net *net, unsigned int cmd,
 			return -EFAULT;
 
 		old_fs = get_fs();
-		set_fs (KERNEL_DS);
+		set_fs(KERNEL_DS);
 		err = dev_ioctl(net, cmd, &kifr);
-		set_fs (old_fs);
+		set_fs(old_fs);
 
 		return err;
 	case SIOCBONDSLAVEINFOQUERY:
@@ -2710,9 +2719,9 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
 		return -EFAULT;
 
 	old_fs = get_fs();
-	set_fs (KERNEL_DS);
+	set_fs(KERNEL_DS);
 	err = dev_ioctl(net, cmd, (void __user *)&ifr);
-	set_fs (old_fs);
+	set_fs(old_fs);
 
 	if (cmd == SIOCGIFMAP && !err) {
 		err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
@@ -2734,7 +2743,7 @@ static int compat_siocshwtstamp(struct net *net, struct compat_ifreq __user *uif
 	compat_uptr_t uptr32;
 	struct ifreq __user *uifr;
 
-	uifr = compat_alloc_user_space(sizeof (*uifr));
+	uifr = compat_alloc_user_space(sizeof(*uifr));
 	if (copy_in_user(uifr, uifr32, sizeof(struct compat_ifreq)))
 		return -EFAULT;
 
@@ -2750,20 +2759,20 @@ static int compat_siocshwtstamp(struct net *net, struct compat_ifreq __user *uif
 }
 
 struct rtentry32 {
-	u32   		rt_pad1;
+	u32		rt_pad1;
 	struct sockaddr rt_dst;         /* target address               */
 	struct sockaddr rt_gateway;     /* gateway addr (RTF_GATEWAY)   */
 	struct sockaddr rt_genmask;     /* target network mask (IP)     */
-	unsigned short  rt_flags;
-	short           rt_pad2;
-	u32   		rt_pad3;
-	unsigned char   rt_tos;
-	unsigned char   rt_class;
-	short           rt_pad4;
-	short           rt_metric;      /* +1 for binary compatibility! */
+	unsigned short	rt_flags;
+	short		rt_pad2;
+	u32		rt_pad3;
+	unsigned char	rt_tos;
+	unsigned char	rt_class;
+	short		rt_pad4;
+	short		rt_metric;      /* +1 for binary compatibility! */
 	/* char * */ u32 rt_dev;        /* forcing the device at add    */
-	u32   		rt_mtu;         /* per route MTU/Window         */
-	u32   		rt_window;      /* Window clamping              */
+	u32		rt_mtu;         /* per route MTU/Window         */
+	u32		rt_window;      /* Window clamping              */
 	unsigned short  rt_irtt;        /* Initial RTT                  */
 };
 
@@ -2793,29 +2802,29 @@ static int routing_ioctl(struct net *net, struct socket *sock,
 
 	if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */
 		struct in6_rtmsg32 __user *ur6 = argp;
-		ret = copy_from_user (&r6.rtmsg_dst, &(ur6->rtmsg_dst),
+		ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst),
 			3 * sizeof(struct in6_addr));
-		ret |= __get_user (r6.rtmsg_type, &(ur6->rtmsg_type));
-		ret |= __get_user (r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
-		ret |= __get_user (r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
-		ret |= __get_user (r6.rtmsg_metric, &(ur6->rtmsg_metric));
-		ret |= __get_user (r6.rtmsg_info, &(ur6->rtmsg_info));
-		ret |= __get_user (r6.rtmsg_flags, &(ur6->rtmsg_flags));
-		ret |= __get_user (r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
+		ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type));
+		ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len));
+		ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len));
+		ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric));
+		ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info));
+		ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags));
+		ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex));
 
 		r = (void *) &r6;
 	} else { /* ipv4 */
 		struct rtentry32 __user *ur4 = argp;
-		ret = copy_from_user (&r4.rt_dst, &(ur4->rt_dst),
+		ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst),
 					3 * sizeof(struct sockaddr));
-		ret |= __get_user (r4.rt_flags, &(ur4->rt_flags));
-		ret |= __get_user (r4.rt_metric, &(ur4->rt_metric));
-		ret |= __get_user (r4.rt_mtu, &(ur4->rt_mtu));
-		ret |= __get_user (r4.rt_window, &(ur4->rt_window));
-		ret |= __get_user (r4.rt_irtt, &(ur4->rt_irtt));
-		ret |= __get_user (rtdev, &(ur4->rt_dev));
+		ret |= __get_user(r4.rt_flags, &(ur4->rt_flags));
+		ret |= __get_user(r4.rt_metric, &(ur4->rt_metric));
+		ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu));
+		ret |= __get_user(r4.rt_window, &(ur4->rt_window));
+		ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt));
+		ret |= __get_user(rtdev, &(ur4->rt_dev));
 		if (rtdev) {
-			ret |= copy_from_user (devname, compat_ptr(rtdev), 15);
+			ret |= copy_from_user(devname, compat_ptr(rtdev), 15);
 			r4.rt_dev = devname; devname[15] = 0;
 		} else
 			r4.rt_dev = NULL;
@@ -2828,9 +2837,9 @@ static int routing_ioctl(struct net *net, struct socket *sock,
 		goto out;
 	}
 
-	set_fs (KERNEL_DS);
+	set_fs(KERNEL_DS);
 	ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r);
-	set_fs (old_fs);
+	set_fs(old_fs);
 
 out:
 	return ret;
@@ -2993,11 +3002,13 @@ int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
 {
 	return sock->ops->bind(sock, addr, addrlen);
 }
+EXPORT_SYMBOL(kernel_bind);
 
 int kernel_listen(struct socket *sock, int backlog)
 {
 	return sock->ops->listen(sock, backlog);
 }
+EXPORT_SYMBOL(kernel_listen);
 
 int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 {
@@ -3022,56 +3033,70 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 done:
 	return err;
 }
+EXPORT_SYMBOL(kernel_accept);
 
 int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
 		   int flags)
 {
 	return sock->ops->connect(sock, addr, addrlen, flags);
 }
+EXPORT_SYMBOL(kernel_connect);
 
 int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
 			 int *addrlen)
 {
 	return sock->ops->getname(sock, addr, addrlen, 0);
 }
+EXPORT_SYMBOL(kernel_getsockname);
 
 int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
 			 int *addrlen)
 {
 	return sock->ops->getname(sock, addr, addrlen, 1);
 }
+EXPORT_SYMBOL(kernel_getpeername);
 
 int kernel_getsockopt(struct socket *sock, int level, int optname,
 			char *optval, int *optlen)
 {
 	mm_segment_t oldfs = get_fs();
+	char __user *uoptval;
+	int __user *uoptlen;
 	int err;
 
+	uoptval = (char __user __force *) optval;
+	uoptlen = (int __user __force *) optlen;
+
 	set_fs(KERNEL_DS);
 	if (level == SOL_SOCKET)
-		err = sock_getsockopt(sock, level, optname, optval, optlen);
+		err = sock_getsockopt(sock, level, optname, uoptval, uoptlen);
 	else
-		err = sock->ops->getsockopt(sock, level, optname, optval,
-					    optlen);
+		err = sock->ops->getsockopt(sock, level, optname, uoptval,
+					    uoptlen);
 	set_fs(oldfs);
 	return err;
 }
+EXPORT_SYMBOL(kernel_getsockopt);
 
 int kernel_setsockopt(struct socket *sock, int level, int optname,
 			char *optval, unsigned int optlen)
 {
 	mm_segment_t oldfs = get_fs();
+	char __user *uoptval;
 	int err;
 
+	uoptval = (char __user __force *) optval;
+
 	set_fs(KERNEL_DS);
 	if (level == SOL_SOCKET)
-		err = sock_setsockopt(sock, level, optname, optval, optlen);
+		err = sock_setsockopt(sock, level, optname, uoptval, optlen);
 	else
-		err = sock->ops->setsockopt(sock, level, optname, optval,
+		err = sock->ops->setsockopt(sock, level, optname, uoptval,
 					    optlen);
 	set_fs(oldfs);
 	return err;
 }
+EXPORT_SYMBOL(kernel_setsockopt);
 
 int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 		    size_t size, int flags)
@@ -3083,6 +3108,7 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset,
 
 	return sock_no_sendpage(sock, page, offset, size, flags);
 }
+EXPORT_SYMBOL(kernel_sendpage);
 
 int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
 {
@@ -3095,33 +3121,10 @@ int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
 
 	return err;
 }
+EXPORT_SYMBOL(kernel_sock_ioctl);
 
 int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how)
 {
 	return sock->ops->shutdown(sock, how);
 }
-
-EXPORT_SYMBOL(sock_create);
-EXPORT_SYMBOL(sock_create_kern);
-EXPORT_SYMBOL(sock_create_lite);
-EXPORT_SYMBOL(sock_map_fd);
-EXPORT_SYMBOL(sock_recvmsg);
-EXPORT_SYMBOL(sock_register);
-EXPORT_SYMBOL(sock_release);
-EXPORT_SYMBOL(sock_sendmsg);
-EXPORT_SYMBOL(sock_unregister);
-EXPORT_SYMBOL(sock_wake_async);
-EXPORT_SYMBOL(sockfd_lookup);
-EXPORT_SYMBOL(kernel_sendmsg);
-EXPORT_SYMBOL(kernel_recvmsg);
-EXPORT_SYMBOL(kernel_bind);
-EXPORT_SYMBOL(kernel_listen);
-EXPORT_SYMBOL(kernel_accept);
-EXPORT_SYMBOL(kernel_connect);
-EXPORT_SYMBOL(kernel_getsockname);
-EXPORT_SYMBOL(kernel_getpeername);
-EXPORT_SYMBOL(kernel_getsockopt);
-EXPORT_SYMBOL(kernel_setsockopt);
-EXPORT_SYMBOL(kernel_sendpage);
-EXPORT_SYMBOL(kernel_sock_ioctl);
 EXPORT_SYMBOL(kernel_sock_shutdown);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 443c161eb8b..8873fd8ddac 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -18,10 +18,11 @@ config SUNRPC_XPRT_RDMA
 	  If unsure, say N.
 
 config RPCSEC_GSS_KRB5
-	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
+	tristate
+	depends on SUNRPC && CRYPTO
+	prompt "Secure RPC: Kerberos V mechanism" if !(NFS_V4 || NFSD_V4)
+	default y
 	select SUNRPC_GSS
-	select CRYPTO
 	select CRYPTO_MD5
 	select CRYPTO_DES
 	select CRYPTO_CBC
@@ -34,23 +35,4 @@ config RPCSEC_GSS_KRB5
 	  available from http://linux-nfs.org/.  In addition, user-space
 	  Kerberos support should be installed.
 
-	  If unsure, say N.
-
-config RPCSEC_GSS_SPKM3
-	tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	select SUNRPC_GSS
-	select CRYPTO
-	select CRYPTO_MD5
-	select CRYPTO_DES
-	select CRYPTO_CAST5
-	select CRYPTO_CBC
-	help
-	  Choose Y here to enable Secure RPC using the SPKM3 public key
-	  GSS-API mechanism (RFC 2025).
-
-	  Secure RPC calls with SPKM3 require an auxiliary userspace
-	  daemon which may be found in the Linux nfs-utils package
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
+	  If unsure, say Y.
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 8dc47f1d000..afe67849269 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -19,6 +19,15 @@
 # define RPCDBG_FACILITY	RPCDBG_AUTH
 #endif
 
+#define RPC_CREDCACHE_DEFAULT_HASHBITS	(4)
+struct rpc_cred_cache {
+	struct hlist_head	*hashtable;
+	unsigned int		hashbits;
+	spinlock_t		lock;
+};
+
+static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS;
+
 static DEFINE_SPINLOCK(rpc_authflavor_lock);
 static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 	&authnull_ops,		/* AUTH_NULL */
@@ -29,6 +38,47 @@ static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = {
 static LIST_HEAD(cred_unused);
 static unsigned long number_cred_unused;
 
+#define MAX_HASHTABLE_BITS (14)
+static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
+{
+	unsigned long num;
+	unsigned int nbits;
+	int ret;
+
+	if (!val)
+		goto out_inval;
+	ret = strict_strtoul(val, 0, &num);
+	if (ret == -EINVAL)
+		goto out_inval;
+	nbits = fls(num);
+	if (num > (1U << nbits))
+		nbits++;
+	if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
+		goto out_inval;
+	*(unsigned int *)kp->arg = nbits;
+	return 0;
+out_inval:
+	return -EINVAL;
+}
+
+static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
+{
+	unsigned int nbits;
+
+	nbits = *(unsigned int *)kp->arg;
+	return sprintf(buffer, "%u", 1U << nbits);
+}
+
+#define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
+
+static struct kernel_param_ops param_ops_hashtbl_sz = {
+	.set = param_set_hashtbl_sz,
+	.get = param_get_hashtbl_sz,
+};
+
+module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
+MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
+
 static u32
 pseudoflavor_to_flavor(u32 flavor) {
 	if (flavor >= RPC_AUTH_MAXFLAVOR)
@@ -145,16 +195,23 @@ int
 rpcauth_init_credcache(struct rpc_auth *auth)
 {
 	struct rpc_cred_cache *new;
-	int i;
+	unsigned int hashsize;
 
 	new = kmalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
-		return -ENOMEM;
-	for (i = 0; i < RPC_CREDCACHE_NR; i++)
-		INIT_HLIST_HEAD(&new->hashtable[i]);
+		goto out_nocache;
+	new->hashbits = auth_hashbits;
+	hashsize = 1U << new->hashbits;
+	new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL);
+	if (!new->hashtable)
+		goto out_nohashtbl;
 	spin_lock_init(&new->lock);
 	auth->au_credcache = new;
 	return 0;
+out_nohashtbl:
+	kfree(new);
+out_nocache:
+	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_credcache);
 
@@ -183,11 +240,12 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache)
 	LIST_HEAD(free);
 	struct hlist_head *head;
 	struct rpc_cred	*cred;
+	unsigned int hashsize = 1U << cache->hashbits;
 	int		i;
 
 	spin_lock(&rpc_credcache_lock);
 	spin_lock(&cache->lock);
-	for (i = 0; i < RPC_CREDCACHE_NR; i++) {
+	for (i = 0; i < hashsize; i++) {
 		head = &cache->hashtable[i];
 		while (!hlist_empty(head)) {
 			cred = hlist_entry(head->first, struct rpc_cred, cr_hash);
@@ -216,6 +274,7 @@ rpcauth_destroy_credcache(struct rpc_auth *auth)
 	if (cache) {
 		auth->au_credcache = NULL;
 		rpcauth_clear_credcache(cache);
+		kfree(cache->hashtable);
 		kfree(cache);
 	}
 }
@@ -297,7 +356,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 			*entry, *new;
 	unsigned int nr;
 
-	nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS);
+	nr = hash_long(acred->uid, cache->hashbits);
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) {
@@ -390,16 +449,16 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
 }
 EXPORT_SYMBOL_GPL(rpcauth_init_cred);
 
-void
+struct rpc_cred *
 rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
 {
-	task->tk_msg.rpc_cred = get_rpccred(cred);
 	dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid,
 			cred->cr_auth->au_ops->au_name, cred);
+	return get_rpccred(cred);
 }
 EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred);
 
-static void
+static struct rpc_cred *
 rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
@@ -407,45 +466,43 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 		.uid = 0,
 		.gid = 0,
 	};
-	struct rpc_cred *ret;
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, task->tk_client->cl_auth->au_ops->au_name);
-	ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
-	if (!IS_ERR(ret))
-		task->tk_msg.rpc_cred = ret;
-	else
-		task->tk_status = PTR_ERR(ret);
+	return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 }
 
-static void
+static struct rpc_cred *
 rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
-	struct rpc_cred *ret;
 
 	dprintk("RPC: %5u looking up %s cred\n",
 		task->tk_pid, auth->au_ops->au_name);
-	ret = rpcauth_lookupcred(auth, lookupflags);
-	if (!IS_ERR(ret))
-		task->tk_msg.rpc_cred = ret;
-	else
-		task->tk_status = PTR_ERR(ret);
+	return rpcauth_lookupcred(auth, lookupflags);
 }
 
-void
+static int
 rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags)
 {
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_cred *new;
 	int lookupflags = 0;
 
 	if (flags & RPC_TASK_ASYNC)
 		lookupflags |= RPCAUTH_LOOKUP_NEW;
 	if (cred != NULL)
-		cred->cr_ops->crbind(task, cred, lookupflags);
+		new = cred->cr_ops->crbind(task, cred, lookupflags);
 	else if (flags & RPC_TASK_ROOTCREDS)
-		rpcauth_bind_root_cred(task, lookupflags);
+		new = rpcauth_bind_root_cred(task, lookupflags);
 	else
-		rpcauth_bind_new_cred(task, lookupflags);
+		new = rpcauth_bind_new_cred(task, lookupflags);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+	if (req->rq_cred != NULL)
+		put_rpccred(req->rq_cred);
+	req->rq_cred = new;
+	return 0;
 }
 
 void
@@ -484,22 +541,10 @@ out_nodestroy:
 }
 EXPORT_SYMBOL_GPL(put_rpccred);
 
-void
-rpcauth_unbindcred(struct rpc_task *task)
-{
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
-
-	dprintk("RPC: %5u releasing %s cred %p\n",
-		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
-
-	put_rpccred(cred);
-	task->tk_msg.rpc_cred = NULL;
-}
-
 __be32 *
 rpcauth_marshcred(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
+	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u marshaling %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
@@ -510,7 +555,7 @@ rpcauth_marshcred(struct rpc_task *task, __be32 *p)
 __be32 *
 rpcauth_checkverf(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
+	struct rpc_cred	*cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u validating %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
@@ -522,7 +567,7 @@ int
 rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp,
 		__be32 *data, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u using %s cred %p to wrap rpc data\n",
 			task->tk_pid, cred->cr_ops->cr_name, cred);
@@ -536,7 +581,7 @@ int
 rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 		__be32 *data, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u using %s cred %p to unwrap rpc data\n",
 			task->tk_pid, cred->cr_ops->cr_name, cred);
@@ -550,13 +595,21 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 int
 rpcauth_refreshcred(struct rpc_task *task)
 {
-	struct rpc_cred	*cred = task->tk_msg.rpc_cred;
+	struct rpc_cred	*cred;
 	int err;
 
+	cred = task->tk_rqstp->rq_cred;
+	if (cred == NULL) {
+		err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags);
+		if (err < 0)
+			goto out;
+		cred = task->tk_rqstp->rq_cred;
+	};
 	dprintk("RPC: %5u refreshing %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
 
 	err = cred->cr_ops->crrefresh(task);
+out:
 	if (err < 0)
 		task->tk_status = err;
 	return err;
@@ -565,7 +618,7 @@ rpcauth_refreshcred(struct rpc_task *task)
 void
 rpcauth_invalcred(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	dprintk("RPC: %5u invalidating %s cred %p\n",
 		task->tk_pid, cred->cr_auth->au_ops->au_name, cred);
@@ -576,7 +629,7 @@ rpcauth_invalcred(struct rpc_task *task)
 int
 rpcauth_uptodatecred(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 
 	return cred == NULL ||
 		test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0;
@@ -587,14 +640,27 @@ static struct shrinker rpc_cred_shrinker = {
 	.seeks = DEFAULT_SEEKS,
 };
 
-void __init rpcauth_init_module(void)
+int __init rpcauth_init_module(void)
 {
-	rpc_init_authunix();
-	rpc_init_generic_auth();
+	int err;
+
+	err = rpc_init_authunix();
+	if (err < 0)
+		goto out1;
+	err = rpc_init_generic_auth();
+	if (err < 0)
+		goto out2;
 	register_shrinker(&rpc_cred_shrinker);
+	return 0;
+out2:
+	rpc_destroy_authunix();
+out1:
+	return err;
 }
 
-void __exit rpcauth_remove_module(void)
+void rpcauth_remove_module(void)
 {
+	rpc_destroy_authunix();
+	rpc_destroy_generic_auth();
 	unregister_shrinker(&rpc_cred_shrinker);
 }
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 8f623b0f03d..e010a015d99 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -27,7 +27,6 @@ struct generic_cred {
 };
 
 static struct rpc_auth generic_auth;
-static struct rpc_cred_cache generic_cred_cache;
 static const struct rpc_credops generic_credops;
 
 /*
@@ -55,18 +54,13 @@ struct rpc_cred *rpc_lookup_machine_cred(void)
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
 
-static void
-generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags)
+static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
+		struct rpc_cred *cred, int lookupflags)
 {
 	struct rpc_auth *auth = task->tk_client->cl_auth;
 	struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred;
-	struct rpc_cred *ret;
 
-	ret = auth->au_ops->lookup_cred(auth, acred, lookupflags);
-	if (!IS_ERR(ret))
-		task->tk_msg.rpc_cred = ret;
-	else
-		task->tk_status = PTR_ERR(ret);
+	return auth->au_ops->lookup_cred(auth, acred, lookupflags);
 }
 
 /*
@@ -159,20 +153,16 @@ out_nomatch:
 	return 0;
 }
 
-void __init rpc_init_generic_auth(void)
+int __init rpc_init_generic_auth(void)
 {
-	spin_lock_init(&generic_cred_cache.lock);
+	return rpcauth_init_credcache(&generic_auth);
 }
 
-void __exit rpc_destroy_generic_auth(void)
+void rpc_destroy_generic_auth(void)
 {
-	rpcauth_clear_credcache(&generic_cred_cache);
+	rpcauth_destroy_credcache(&generic_auth);
 }
 
-static struct rpc_cred_cache generic_cred_cache = {
-	{{ NULL, },},
-};
-
 static const struct rpc_authops generic_auth_ops = {
 	.owner = THIS_MODULE,
 	.au_name = "Generic",
@@ -183,7 +173,6 @@ static const struct rpc_authops generic_auth_ops = {
 static struct rpc_auth generic_auth = {
 	.au_ops = &generic_auth_ops,
 	.au_count = ATOMIC_INIT(0),
-	.au_credcache = &generic_cred_cache,
 };
 
 static const struct rpc_credops generic_credops = {
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index 74a231735f6..7350d86a32e 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -11,8 +11,3 @@ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
 
 rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
 	gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o gss_krb5_keys.o
-
-obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
-
-rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \
-	gss_spkm3_token.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 8da2a0e6857..3835ce35e22 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -373,7 +373,7 @@ gss_handle_downcall_result(struct gss_cred *gss_cred, struct gss_upcall_msg *gss
 static void
 gss_upcall_callback(struct rpc_task *task)
 {
-	struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred,
+	struct gss_cred *gss_cred = container_of(task->tk_rqstp->rq_cred,
 			struct gss_cred, gc_base);
 	struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall;
 	struct inode *inode = &gss_msg->inode->vfs_inode;
@@ -502,7 +502,7 @@ static void warn_gssd(void)
 static inline int
 gss_refresh_upcall(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_auth *gss_auth = container_of(cred->cr_auth,
 			struct gss_auth, rpc_auth);
 	struct gss_cred *gss_cred = container_of(cred,
@@ -745,17 +745,18 @@ gss_pipe_release(struct inode *inode)
 	struct rpc_inode *rpci = RPC_I(inode);
 	struct gss_upcall_msg *gss_msg;
 
+restart:
 	spin_lock(&inode->i_lock);
-	while (!list_empty(&rpci->in_downcall)) {
+	list_for_each_entry(gss_msg, &rpci->in_downcall, list) {
 
-		gss_msg = list_entry(rpci->in_downcall.next,
-				struct gss_upcall_msg, list);
+		if (!list_empty(&gss_msg->msg.list))
+			continue;
 		gss_msg->msg.errno = -EPIPE;
 		atomic_inc(&gss_msg->count);
 		__gss_unhash_msg(gss_msg);
 		spin_unlock(&inode->i_lock);
 		gss_release_msg(gss_msg);
-		spin_lock(&inode->i_lock);
+		goto restart;
 	}
 	spin_unlock(&inode->i_lock);
 
@@ -928,6 +929,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
 {
 	dprintk("RPC:       gss_free_ctx\n");
 
+	gss_delete_sec_context(&ctx->gc_gss_ctx);
 	kfree(ctx->gc_wire_ctx.data);
 	kfree(ctx);
 }
@@ -942,13 +944,7 @@ gss_free_ctx_callback(struct rcu_head *head)
 static void
 gss_free_ctx(struct gss_cl_ctx *ctx)
 {
-	struct gss_ctx *gc_gss_ctx;
-
-	gc_gss_ctx = rcu_dereference(ctx->gc_gss_ctx);
-	rcu_assign_pointer(ctx->gc_gss_ctx, NULL);
 	call_rcu(&ctx->gc_rcu, gss_free_ctx_callback);
-	if (gc_gss_ctx)
-		gss_delete_sec_context(&gc_gss_ctx);
 }
 
 static void
@@ -1054,7 +1050,7 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 out:
 	if (acred->machine_cred != gss_cred->gc_machine_cred)
 		return 0;
-	return (rc->cr_uid == acred->uid);
+	return rc->cr_uid == acred->uid;
 }
 
 /*
@@ -1064,12 +1060,12 @@ out:
 static __be32 *
 gss_marshal(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_rqst *req = task->tk_rqstp;
+	struct rpc_cred *cred = req->rq_cred;
 	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
 						 gc_base);
 	struct gss_cl_ctx	*ctx = gss_cred_get_ctx(cred);
 	__be32		*cred_len;
-	struct rpc_rqst *req = task->tk_rqstp;
 	u32             maj_stat = 0;
 	struct xdr_netobj mic;
 	struct kvec	iov;
@@ -1119,7 +1115,7 @@ out_put_ctx:
 
 static int gss_renew_cred(struct rpc_task *task)
 {
-	struct rpc_cred *oldcred = task->tk_msg.rpc_cred;
+	struct rpc_cred *oldcred = task->tk_rqstp->rq_cred;
 	struct gss_cred *gss_cred = container_of(oldcred,
 						 struct gss_cred,
 						 gc_base);
@@ -1133,7 +1129,7 @@ static int gss_renew_cred(struct rpc_task *task)
 	new = gss_lookup_cred(auth, &acred, RPCAUTH_LOOKUP_NEW);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	task->tk_msg.rpc_cred = new;
+	task->tk_rqstp->rq_cred = new;
 	put_rpccred(oldcred);
 	return 0;
 }
@@ -1161,7 +1157,7 @@ static int gss_cred_is_negative_entry(struct rpc_cred *cred)
 static int
 gss_refresh(struct rpc_task *task)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	int ret = 0;
 
 	if (gss_cred_is_negative_entry(cred))
@@ -1172,7 +1168,7 @@ gss_refresh(struct rpc_task *task)
 		ret = gss_renew_cred(task);
 		if (ret < 0)
 			goto out;
-		cred = task->tk_msg.rpc_cred;
+		cred = task->tk_rqstp->rq_cred;
 	}
 
 	if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags))
@@ -1191,7 +1187,7 @@ gss_refresh_null(struct rpc_task *task)
 static __be32 *
 gss_validate(struct rpc_task *task, __be32 *p)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
 	__be32		seq;
 	struct kvec	iov;
@@ -1400,7 +1396,7 @@ static int
 gss_wrap_req(struct rpc_task *task,
 	     kxdrproc_t encode, void *rqstp, __be32 *p, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cred	*gss_cred = container_of(cred, struct gss_cred,
 			gc_base);
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
@@ -1503,7 +1499,7 @@ static int
 gss_unwrap_resp(struct rpc_task *task,
 		kxdrproc_t decode, void *rqstp, __be32 *p, void *obj)
 {
-	struct rpc_cred *cred = task->tk_msg.rpc_cred;
+	struct rpc_cred *cred = task->tk_rqstp->rq_cred;
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred,
 			gc_base);
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index 310b78e9945..c586e92bcf7 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -76,19 +76,19 @@ static int
 der_length_size( int length)
 {
 	if (length < (1<<7))
-		return(1);
+		return 1;
 	else if (length < (1<<8))
-		return(2);
+		return 2;
 #if (SIZEOF_INT == 2)
 	else
-		return(3);
+		return 3;
 #else
 	else if (length < (1<<16))
-		return(3);
+		return 3;
 	else if (length < (1<<24))
-		return(4);
+		return 4;
 	else
-		return(5);
+		return 5;
 #endif
 }
 
@@ -121,14 +121,14 @@ der_read_length(unsigned char **buf, int *bufsize)
 	int ret;
 
 	if (*bufsize < 1)
-		return(-1);
+		return -1;
 	sf = *(*buf)++;
 	(*bufsize)--;
 	if (sf & 0x80) {
 		if ((sf &= 0x7f) > ((*bufsize)-1))
-			return(-1);
+			return -1;
 		if (sf > SIZEOF_INT)
-			return (-1);
+			return -1;
 		ret = 0;
 		for (; sf; sf--) {
 			ret = (ret<<8) + (*(*buf)++);
@@ -138,7 +138,7 @@ der_read_length(unsigned char **buf, int *bufsize)
 		ret = sf;
 	}
 
-	return(ret);
+	return ret;
 }
 
 /* returns the length of a token, given the mech oid and the body size */
@@ -148,7 +148,7 @@ g_token_size(struct xdr_netobj *mech, unsigned int body_size)
 {
 	/* set body_size to sequence contents size */
 	body_size += 2 + (int) mech->len;         /* NEED overflow check */
-	return(1 + der_length_size(body_size) + body_size);
+	return 1 + der_length_size(body_size) + body_size;
 }
 
 EXPORT_SYMBOL_GPL(g_token_size);
@@ -186,27 +186,27 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
 	int ret = 0;
 
 	if ((toksize-=1) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	if (*buf++ != 0x60)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if ((seqsize = der_read_length(&buf, &toksize)) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if (seqsize != toksize)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if ((toksize-=1) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	if (*buf++ != 0x06)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if ((toksize-=1) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	toid.len = *buf++;
 
 	if ((toksize-=toid.len) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	toid.data = buf;
 	buf+=toid.len;
 
@@ -217,17 +217,17 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
       to return G_BAD_TOK_HEADER if the token header is in fact bad */
 
 	if ((toksize-=2) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if (ret)
-		return(ret);
+		return ret;
 
 	if (!ret) {
 		*buf_in = buf;
 		*body_size = toksize;
 	}
 
-	return(ret);
+	return ret;
 }
 
 EXPORT_SYMBOL_GPL(g_verify_token_header);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 03264461052..f375decc024 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -237,6 +237,7 @@ get_key(const void *p, const void *end,
 	if (!supported_gss_krb5_enctype(alg)) {
 		printk(KERN_WARNING "gss_kerberos_mech: unsupported "
 			"encryption key algorithm %d\n", alg);
+		p = ERR_PTR(-EINVAL);
 		goto out_err;
 	}
 	p = simple_get_netobj(p, end, &key);
@@ -282,15 +283,19 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
 	ctx->enctype = ENCTYPE_DES_CBC_RAW;
 
 	ctx->gk5e = get_gss_krb5_enctype(ctx->enctype);
-	if (ctx->gk5e == NULL)
+	if (ctx->gk5e == NULL) {
+		p = ERR_PTR(-EINVAL);
 		goto out_err;
+	}
 
 	/* The downcall format was designed before we completely understood
 	 * the uses of the context fields; so it includes some stuff we
 	 * just give some minimal sanity-checking, and some we ignore
 	 * completely (like the next twenty bytes): */
-	if (unlikely(p + 20 > end || p + 20 < p))
+	if (unlikely(p + 20 > end || p + 20 < p)) {
+		p = ERR_PTR(-EFAULT);
 		goto out_err;
+	}
 	p += 20;
 	p = simple_get_bytes(p, end, &tmp, sizeof(tmp));
 	if (IS_ERR(p))
@@ -422,7 +427,7 @@ static int
 context_derive_keys_rc4(struct krb5_ctx *ctx)
 {
 	struct crypto_hash *hmac;
-	char sigkeyconstant[] = "signaturekey";
+	static const char sigkeyconstant[] = "signaturekey";
 	int slen = strlen(sigkeyconstant) + 1;	/* include null terminator */
 	struct hash_desc desc;
 	struct scatterlist sg[1];
@@ -619,6 +624,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx,
 	if (ctx->seq_send64 != ctx->seq_send) {
 		dprintk("%s: seq_send64 %lx, seq_send %x overflow?\n", __func__,
 			(long unsigned)ctx->seq_send64, ctx->seq_send);
+		p = ERR_PTR(-EINVAL);
 		goto out_err;
 	}
 	p = simple_get_bytes(p, end, &ctx->enctype, sizeof(ctx->enctype));
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 415c013ba38..62ac90c62cb 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -162,5 +162,5 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
 	*seqnum = ((plain[0]) |
 		   (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
 
-	return (0);
+	return 0;
 }
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 2689de39dc7..8b4061049d7 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -331,7 +331,7 @@ gss_delete_sec_context(struct gss_ctx	**context_handle)
 			*context_handle);
 
 	if (!*context_handle)
-		return(GSS_S_NO_CONTEXT);
+		return GSS_S_NO_CONTEXT;
 	if ((*context_handle)->internal_ctx_id)
 		(*context_handle)->mech_type->gm_ops
 			->gss_delete_sec_context((*context_handle)
diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c
deleted file mode 100644
index dc3f1f5ed86..00000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_mech.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- *  linux/net/sunrpc/gss_spkm3_mech.c
- *
- *  Copyright (c) 2003 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@umich.edu>
- *  J. Bruce Fields <bfields@umich.edu>
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the University nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/sunrpc/auth.h>
-#include <linux/in.h>
-#include <linux/sunrpc/svcauth_gss.h>
-#include <linux/sunrpc/gss_spkm3.h>
-#include <linux/sunrpc/xdr.h>
-#include <linux/crypto.h>
-
-#ifdef RPC_DEBUG
-# define RPCDBG_FACILITY	RPCDBG_AUTH
-#endif
-
-static const void *
-simple_get_bytes(const void *p, const void *end, void *res, int len)
-{
-	const void *q = (const void *)((const char *)p + len);
-	if (unlikely(q > end || q < p))
-		return ERR_PTR(-EFAULT);
-	memcpy(res, p, len);
-	return q;
-}
-
-static const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
-{
-	const void *q;
-	unsigned int len;
-	p = simple_get_bytes(p, end, &len, sizeof(len));
-	if (IS_ERR(p))
-		return p;
-	res->len = len;
-	if (len == 0) {
-		res->data = NULL;
-		return p;
-	}
-	q = (const void *)((const char *)p + len);
-	if (unlikely(q > end || q < p))
-		return ERR_PTR(-EFAULT);
-	res->data = kmemdup(p, len, GFP_NOFS);
-	if (unlikely(res->data == NULL))
-		return ERR_PTR(-ENOMEM);
-	return q;
-}
-
-static int
-gss_import_sec_context_spkm3(const void *p, size_t len,
-				struct gss_ctx *ctx_id,
-				gfp_t gfp_mask)
-{
-	const void *end = (const void *)((const char *)p + len);
-	struct	spkm3_ctx *ctx;
-	int	version;
-
-	if (!(ctx = kzalloc(sizeof(*ctx), gfp_mask)))
-		goto out_err;
-
-	p = simple_get_bytes(p, end, &version, sizeof(version));
-	if (IS_ERR(p))
-		goto out_err_free_ctx;
-	if (version != 1) {
-		dprintk("RPC:       unknown spkm3 token format: "
-				"obsolete nfs-utils?\n");
-		goto out_err_free_ctx;
-	}
-
-	p = simple_get_netobj(p, end, &ctx->ctx_id);
-	if (IS_ERR(p))
-		goto out_err_free_ctx;
-
-	p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime));
-	if (IS_ERR(p))
-		goto out_err_free_ctx_id;
-
-	p = simple_get_netobj(p, end, &ctx->mech_used);
-	if (IS_ERR(p))
-		goto out_err_free_ctx_id;
-
-	p = simple_get_bytes(p, end, &ctx->ret_flags, sizeof(ctx->ret_flags));
-	if (IS_ERR(p))
-		goto out_err_free_mech;
-
-	p = simple_get_netobj(p, end, &ctx->conf_alg);
-	if (IS_ERR(p))
-		goto out_err_free_mech;
-
-	p = simple_get_netobj(p, end, &ctx->derived_conf_key);
-	if (IS_ERR(p))
-		goto out_err_free_conf_alg;
-
-	p = simple_get_netobj(p, end, &ctx->intg_alg);
-	if (IS_ERR(p))
-		goto out_err_free_conf_key;
-
-	p = simple_get_netobj(p, end, &ctx->derived_integ_key);
-	if (IS_ERR(p))
-		goto out_err_free_intg_alg;
-
-	if (p != end)
-		goto out_err_free_intg_key;
-
-	ctx_id->internal_ctx_id = ctx;
-
-	dprintk("RPC:       Successfully imported new spkm context.\n");
-	return 0;
-
-out_err_free_intg_key:
-	kfree(ctx->derived_integ_key.data);
-out_err_free_intg_alg:
-	kfree(ctx->intg_alg.data);
-out_err_free_conf_key:
-	kfree(ctx->derived_conf_key.data);
-out_err_free_conf_alg:
-	kfree(ctx->conf_alg.data);
-out_err_free_mech:
-	kfree(ctx->mech_used.data);
-out_err_free_ctx_id:
-	kfree(ctx->ctx_id.data);
-out_err_free_ctx:
-	kfree(ctx);
-out_err:
-	return PTR_ERR(p);
-}
-
-static void
-gss_delete_sec_context_spkm3(void *internal_ctx)
-{
-	struct spkm3_ctx *sctx = internal_ctx;
-
-	kfree(sctx->derived_integ_key.data);
-	kfree(sctx->intg_alg.data);
-	kfree(sctx->derived_conf_key.data);
-	kfree(sctx->conf_alg.data);
-	kfree(sctx->mech_used.data);
-	kfree(sctx->ctx_id.data);
-	kfree(sctx);
-}
-
-static u32
-gss_verify_mic_spkm3(struct gss_ctx		*ctx,
-			struct xdr_buf		*signbuf,
-			struct xdr_netobj	*checksum)
-{
-	u32 maj_stat = 0;
-	struct spkm3_ctx *sctx = ctx->internal_ctx_id;
-
-	maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK);
-
-	dprintk("RPC:       gss_verify_mic_spkm3 returning %d\n", maj_stat);
-	return maj_stat;
-}
-
-static u32
-gss_get_mic_spkm3(struct gss_ctx	*ctx,
-		     struct xdr_buf	*message_buffer,
-		     struct xdr_netobj	*message_token)
-{
-	u32 err = 0;
-	struct spkm3_ctx *sctx = ctx->internal_ctx_id;
-
-	err = spkm3_make_token(sctx, message_buffer,
-				message_token, SPKM_MIC_TOK);
-	dprintk("RPC:       gss_get_mic_spkm3 returning %d\n", err);
-	return err;
-}
-
-static const struct gss_api_ops gss_spkm3_ops = {
-	.gss_import_sec_context	= gss_import_sec_context_spkm3,
-	.gss_get_mic		= gss_get_mic_spkm3,
-	.gss_verify_mic		= gss_verify_mic_spkm3,
-	.gss_delete_sec_context	= gss_delete_sec_context_spkm3,
-};
-
-static struct pf_desc gss_spkm3_pfs[] = {
-	{RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"},
-	{RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"},
-};
-
-static struct gss_api_mech gss_spkm3_mech = {
-	.gm_name	= "spkm3",
-	.gm_owner	= THIS_MODULE,
-	.gm_oid		= {7, "\053\006\001\005\005\001\003"},
-	.gm_ops		= &gss_spkm3_ops,
-	.gm_pf_num	= ARRAY_SIZE(gss_spkm3_pfs),
-	.gm_pfs		= gss_spkm3_pfs,
-};
-
-static int __init init_spkm3_module(void)
-{
-	int status;
-
-	status = gss_mech_register(&gss_spkm3_mech);
-	if (status)
-		printk("Failed to register spkm3 gss mechanism!\n");
-	return status;
-}
-
-static void __exit cleanup_spkm3_module(void)
-{
-	gss_mech_unregister(&gss_spkm3_mech);
-}
-
-MODULE_LICENSE("GPL");
-module_init(init_spkm3_module);
-module_exit(cleanup_spkm3_module);
diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c
deleted file mode 100644
index 5a3a65a0e2b..00000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_seal.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- *  linux/net/sunrpc/gss_spkm3_seal.c
- *
- *  Copyright (c) 2003 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@umich.edu>
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the University nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/types.h>
-#include <linux/jiffies.h>
-#include <linux/sunrpc/gss_spkm3.h>
-#include <linux/random.h>
-#include <linux/crypto.h>
-#include <linux/pagemap.h>
-#include <linux/scatterlist.h>
-#include <linux/sunrpc/xdr.h>
-
-#ifdef RPC_DEBUG
-# define RPCDBG_FACILITY        RPCDBG_AUTH
-#endif
-
-const struct xdr_netobj hmac_md5_oid = { 8, "\x2B\x06\x01\x05\x05\x08\x01\x01"};
-const struct xdr_netobj cast5_cbc_oid = {9, "\x2A\x86\x48\x86\xF6\x7D\x07\x42\x0A"};
-
-/*
- * spkm3_make_token()
- *
- * Only SPKM_MIC_TOK with md5 intg-alg is supported
- */
-
-u32
-spkm3_make_token(struct spkm3_ctx *ctx,
-		   struct xdr_buf * text, struct xdr_netobj * token,
-		   int toktype)
-{
-	s32			checksum_type;
-	char			tokhdrbuf[25];
-	char			cksumdata[16];
-	struct xdr_netobj	md5cksum = {.len = 0, .data = cksumdata};
-	struct xdr_netobj	mic_hdr = {.len = 0, .data = tokhdrbuf};
-	int			tokenlen = 0;
-	unsigned char		*ptr;
-	s32			now;
-	int			ctxelen = 0, ctxzbit = 0;
-	int			md5elen = 0, md5zbit = 0;
-
-	now = jiffies;
-
-	if (ctx->ctx_id.len != 16) {
-		dprintk("RPC:       spkm3_make_token BAD ctx_id.len %d\n",
-				ctx->ctx_id.len);
-		goto out_err;
-	}
-
-	if (!g_OID_equal(&ctx->intg_alg, &hmac_md5_oid)) {
-		dprintk("RPC:       gss_spkm3_seal: unsupported I-ALG "
-				"algorithm.  only support hmac-md5 I-ALG.\n");
-		goto out_err;
-	} else
-		checksum_type = CKSUMTYPE_HMAC_MD5;
-
-	if (!g_OID_equal(&ctx->conf_alg, &cast5_cbc_oid)) {
-		dprintk("RPC:       gss_spkm3_seal: unsupported C-ALG "
-				"algorithm\n");
-		goto out_err;
-	}
-
-	if (toktype == SPKM_MIC_TOK) {
-		/* Calculate checksum over the mic-header */
-		asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit);
-		spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data,
-				ctxelen, ctxzbit);
-		if (make_spkm3_checksum(checksum_type, &ctx->derived_integ_key,
-					(char *)mic_hdr.data, mic_hdr.len,
-					text, 0, &md5cksum))
-			goto out_err;
-
-		asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit);
-		tokenlen = 10 + ctxelen + 1 + md5elen + 1;
-
-		/* Create token header using generic routines */
-		token->len = g_token_size(&ctx->mech_used, tokenlen + 2);
-
-		ptr = token->data;
-		g_make_token_header(&ctx->mech_used, tokenlen + 2, &ptr);
-
-		spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit);
-	} else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */
-		dprintk("RPC:       gss_spkm3_seal: SPKM_WRAP_TOK "
-				"not supported\n");
-		goto out_err;
-	}
-
-	/* XXX need to implement sequence numbers, and ctx->expired */
-
-	return  GSS_S_COMPLETE;
-out_err:
-	token->data = NULL;
-	token->len = 0;
-	return GSS_S_FAILURE;
-}
-
-static int
-spkm3_checksummer(struct scatterlist *sg, void *data)
-{
-	struct hash_desc *desc = data;
-
-	return crypto_hash_update(desc, sg, sg->length);
-}
-
-/* checksum the plaintext data and hdrlen bytes of the token header */
-s32
-make_spkm3_checksum(s32 cksumtype, struct xdr_netobj *key, char *header,
-		    unsigned int hdrlen, struct xdr_buf *body,
-		    unsigned int body_offset, struct xdr_netobj *cksum)
-{
-	char				*cksumname;
-	struct hash_desc		desc; /* XXX add to ctx? */
-	struct scatterlist		sg[1];
-	int err;
-
-	switch (cksumtype) {
-		case CKSUMTYPE_HMAC_MD5:
-			cksumname = "hmac(md5)";
-			break;
-		default:
-			dprintk("RPC:       spkm3_make_checksum:"
-					" unsupported checksum %d", cksumtype);
-			return GSS_S_FAILURE;
-	}
-
-	if (key->data == NULL || key->len <= 0) return GSS_S_FAILURE;
-
-	desc.tfm = crypto_alloc_hash(cksumname, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(desc.tfm))
-		return GSS_S_FAILURE;
-	cksum->len = crypto_hash_digestsize(desc.tfm);
-	desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
-	err = crypto_hash_setkey(desc.tfm, key->data, key->len);
-	if (err)
-		goto out;
-
-	err = crypto_hash_init(&desc);
-	if (err)
-		goto out;
-
-	sg_init_one(sg, header, hdrlen);
-	crypto_hash_update(&desc, sg, sg->length);
-
-	xdr_process_buf(body, body_offset, body->len - body_offset,
-			spkm3_checksummer, &desc);
-	crypto_hash_final(&desc, cksum->data);
-
-out:
-	crypto_free_hash(desc.tfm);
-
-	return err ? GSS_S_FAILURE : 0;
-}
diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c
deleted file mode 100644
index a99825d7caa..00000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_token.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- *  linux/net/sunrpc/gss_spkm3_token.c
- *
- *  Copyright (c) 2003 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@umich.edu>
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the University nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/sunrpc/gss_spkm3.h>
-#include <linux/random.h>
-#include <linux/crypto.h>
-
-#ifdef RPC_DEBUG
-# define RPCDBG_FACILITY        RPCDBG_AUTH
-#endif
-
-/*
- * asn1_bitstring_len()
- *
- * calculate the asn1 bitstring length of the xdr_netobject
- */
-void
-asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits)
-{
-	int i, zbit = 0,elen = in->len;
-	char *ptr;
-
-	ptr = &in->data[in->len -1];
-
-	/* count trailing 0's */
-	for(i = in->len; i > 0; i--) {
-		if (*ptr == 0) {
-			ptr--;
-			elen--;
-		} else
-			break;
-	}
-
-	/* count number of 0 bits in final octet */
-	ptr = &in->data[elen - 1];
-	for(i = 0; i < 8; i++) {
-		short mask = 0x01;
-
-		if (!((mask << i) & *ptr))
-			zbit++;
-		else
-			break;
-	}
-	*enclen = elen;
-	*zerobits = zbit;
-}
-
-/*
- * decode_asn1_bitstring()
- *
- * decode a bitstring into a buffer of the expected length.
- * enclen = bit string length
- * explen = expected length (define in rfc)
- */
-int
-decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen)
-{
-	if (!(out->data = kzalloc(explen,GFP_NOFS)))
-		return 0;
-	out->len = explen;
-	memcpy(out->data, in, enclen);
-	return 1;
-}
-
-/*
- * SPKMInnerContextToken choice SPKM_MIC asn1 token layout
- *
- * contextid is always 16 bytes plain data. max asn1 bitstring len = 17.
- *
- * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum)
- *
- * pos  value
- * ----------
- * [0]	a4  SPKM-MIC tag
- * [1]	??  innertoken length  (max 44)
- *
- *
- * tok_hdr piece of checksum data starts here
- *
- * the maximum mic-header len = 9 + 17 = 26
- *	mic-header
- *	----------
- * [2]	30      SEQUENCE tag
- * [3]	??	mic-header length: (max 23) = TokenID + ContextID
- *
- *		TokenID  - all fields constant and can be hardcoded
- *		-------
- * [4]	  02	Type 2
- * [5]	  02	Length 2
- * [6][7] 01 01	TokenID (SPKM_MIC_TOK)
- *
- *		ContextID  - encoded length not constant, calculated
- *		---------
- * [8]	03	Type 3
- * [9]	??	encoded length
- * [10]	??	ctxzbit
- * [11]	 	contextid
- *
- * mic_header piece of checksum data ends here.
- *
- *	int-cksum - encoded length not constant, calculated
- *	---------
- * [??]	03	Type 3
- * [??]	??	encoded length
- * [??]	??	md5zbit
- * [??]	 	int-cksum (NID_md5 = 16)
- *
- * maximum SPKM-MIC innercontext token length =
- *	 10 + encoded contextid_size(17 max) + 2 + encoded
- *       cksum_size (17 maxfor NID_md5) = 46
- */
-
-/*
- * spkm3_mic_header()
- *
- * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation
- * elen: 16 byte context id asn1 bitstring encoded length
- */
-void
-spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit)
-{
-	char *hptr = *hdrbuf;
-	char *top = *hdrbuf;
-
-	*(u8 *)hptr++ = 0x30;
-	*(u8 *)hptr++ = elen + 7;  /* on the wire header length */
-
-	/* tokenid */
-	*(u8 *)hptr++ = 0x02;
-	*(u8 *)hptr++ = 0x02;
-	*(u8 *)hptr++ = 0x01;
-	*(u8 *)hptr++ = 0x01;
-
-	/* coniextid */
-	*(u8 *)hptr++ = 0x03;
-	*(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */
-	*(u8 *)hptr++ = zbit;
-	memcpy(hptr, ctxdata, elen);
-	hptr += elen;
-	*hdrlen = hptr - top;
-}
-
-/*
- * spkm3_mic_innercontext_token()
- *
- * *tokp points to the beginning of the SPKM_MIC token  described
- * in rfc 2025, section 3.2.1:
- *
- * toklen is the inner token length
- */
-void
-spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit)
-{
-	unsigned char *ict = *tokp;
-
-	*(u8 *)ict++ = 0xa4;
-	*(u8 *)ict++ = toklen;
-	memcpy(ict, mic_hdr->data, mic_hdr->len);
-	ict += mic_hdr->len;
-
-	*(u8 *)ict++ = 0x03;
-	*(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */
-	*(u8 *)ict++ = md5zbit;
-	memcpy(ict, md5cksum->data, md5elen);
-}
-
-u32
-spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum)
-{
-	struct xdr_netobj       spkm3_ctx_id = {.len =0, .data = NULL};
-	unsigned char 		*ptr = *tokp;
-	int 			ctxelen;
-	u32     		ret = GSS_S_DEFECTIVE_TOKEN;
-
-	/* spkm3 innercontext token preamble */
-	if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) {
-		dprintk("RPC:       BAD SPKM ictoken preamble\n");
-		goto out;
-	}
-
-	*mic_hdrlen = ptr[3];
-
-	/* token type */
-	if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) {
-		dprintk("RPC:       BAD asn1 SPKM3 token type\n");
-		goto out;
-	}
-
-	/* only support SPKM_MIC_TOK */
-	if((ptr[6] != 0x01) || (ptr[7] != 0x01)) {
-		dprintk("RPC:       ERROR unsupported SPKM3 token\n");
-		goto out;
-	}
-
-	/* contextid */
-	if (ptr[8] != 0x03) {
-		dprintk("RPC:       BAD SPKM3 asn1 context-id type\n");
-		goto out;
-	}
-
-	ctxelen = ptr[9];
-	if (ctxelen > 17) {  /* length includes asn1 zbit octet */
-		dprintk("RPC:       BAD SPKM3 contextid len %d\n", ctxelen);
-		goto out;
-	}
-
-	/* ignore ptr[10] */
-
-	if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16))
-		goto out;
-
-	/*
-	* in the current implementation: the optional int-alg is not present
-	* so the default int-alg (md5) is used the optional snd-seq field is
-	* also not present
-	*/
-
-	if (*mic_hdrlen != 6 + ctxelen) {
-		dprintk("RPC:       BAD SPKM_ MIC_TOK header len %d: we only "
-				"support default int-alg (should be absent) "
-				"and do not support snd-seq\n", *mic_hdrlen);
-		goto out;
-	}
-	/* checksum */
-	*cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */
-
-	ret = GSS_S_COMPLETE;
-out:
-	kfree(spkm3_ctx_id.data);
-	return ret;
-}
-
diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c
deleted file mode 100644
index cc21ee860bb..00000000000
--- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- *  linux/net/sunrpc/gss_spkm3_unseal.c
- *
- *  Copyright (c) 2003 The Regents of the University of Michigan.
- *  All rights reserved.
- *
- *  Andy Adamson <andros@umich.edu>
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions
- *  are met:
- *
- *  1. Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
- *  3. Neither the name of the University nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/sunrpc/gss_spkm3.h>
-#include <linux/crypto.h>
-
-#ifdef RPC_DEBUG
-# define RPCDBG_FACILITY        RPCDBG_AUTH
-#endif
-
-/*
- * spkm3_read_token()
- *
- * only SPKM_MIC_TOK with md5 intg-alg is supported
- */
-u32
-spkm3_read_token(struct spkm3_ctx *ctx,
-		struct xdr_netobj *read_token,    /* checksum */
-		struct xdr_buf *message_buffer, /* signbuf */
-		int toktype)
-{
-	s32			checksum_type;
-	s32			code;
-	struct xdr_netobj	wire_cksum = {.len =0, .data = NULL};
-	char			cksumdata[16];
-	struct xdr_netobj	md5cksum = {.len = 0, .data = cksumdata};
-	unsigned char		*ptr = (unsigned char *)read_token->data;
-	unsigned char		*cksum;
-	int			bodysize, md5elen;
-	int			mic_hdrlen;
-	u32			ret = GSS_S_DEFECTIVE_TOKEN;
-
-	if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used,
-					&bodysize, &ptr, read_token->len))
-		goto out;
-
-	/* decode the token */
-
-	if (toktype != SPKM_MIC_TOK) {
-		dprintk("RPC:       BAD SPKM3 token type: %d\n", toktype);
-		goto out;
-	}
-
-	if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum)))
-		goto out;
-
-	if (*cksum++ != 0x03) {
-		dprintk("RPC:       spkm3_read_token BAD checksum type\n");
-		goto out;
-	}
-	md5elen = *cksum++;
-	cksum++; 	/* move past the zbit */
-
-	if (!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16))
-		goto out;
-
-	/* HARD CODED FOR MD5 */
-
-	/* compute the checksum of the message.
-	 * ptr + 2 = start of header piece of checksum
-	 * mic_hdrlen + 2 = length of header piece of checksum
-	 */
-	ret = GSS_S_DEFECTIVE_TOKEN;
-	if (!g_OID_equal(&ctx->intg_alg, &hmac_md5_oid)) {
-		dprintk("RPC:       gss_spkm3_seal: unsupported I-ALG "
-				"algorithm\n");
-		goto out;
-	}
-
-	checksum_type = CKSUMTYPE_HMAC_MD5;
-
-	code = make_spkm3_checksum(checksum_type,
-		&ctx->derived_integ_key, ptr + 2, mic_hdrlen + 2,
-		message_buffer, 0, &md5cksum);
-
-	if (code)
-		goto out;
-
-	ret = GSS_S_BAD_SIG;
-	code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len);
-	if (code) {
-		dprintk("RPC:       bad MIC checksum\n");
-		goto out;
-	}
-
-
-	/* XXX: need to add expiration and sequencing */
-	ret = GSS_S_COMPLETE;
-out:
-	kfree(wire_cksum.data);
-	return ret;
-}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index cc385b3a59c..dec2a6fc7c1 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -964,7 +964,7 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
 	if (rqstp->rq_gssclient == NULL)
 		return SVC_DENIED;
 	stat = svcauth_unix_set_client(rqstp);
-	if (stat == SVC_DROP)
+	if (stat == SVC_DROP || stat == SVC_CLOSE)
 		return stat;
 	return SVC_OK;
 }
@@ -1018,7 +1018,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
 		return SVC_DENIED;
 	memset(&rsikey, 0, sizeof(rsikey));
 	if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx))
-		return SVC_DROP;
+		return SVC_CLOSE;
 	*authp = rpc_autherr_badverf;
 	if (svc_safe_getnetobj(argv, &tmpobj)) {
 		kfree(rsikey.in_handle.data);
@@ -1026,38 +1026,35 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp,
 	}
 	if (dup_netobj(&rsikey.in_token, &tmpobj)) {
 		kfree(rsikey.in_handle.data);
-		return SVC_DROP;
+		return SVC_CLOSE;
 	}
 
 	/* Perform upcall, or find upcall result: */
 	rsip = rsi_lookup(&rsikey);
 	rsi_free(&rsikey);
 	if (!rsip)
-		return SVC_DROP;
-	switch (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) {
-	case -EAGAIN:
-	case -ETIMEDOUT:
-	case -ENOENT:
+		return SVC_CLOSE;
+	if (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0)
 		/* No upcall result: */
-		return SVC_DROP;
-	case 0:
-		ret = SVC_DROP;
-		/* Got an answer to the upcall; use it: */
-		if (gss_write_init_verf(rqstp, rsip))
-			goto out;
-		if (resv->iov_len + 4 > PAGE_SIZE)
-			goto out;
-		svc_putnl(resv, RPC_SUCCESS);
-		if (svc_safe_putnetobj(resv, &rsip->out_handle))
-			goto out;
-		if (resv->iov_len + 3 * 4 > PAGE_SIZE)
-			goto out;
-		svc_putnl(resv, rsip->major_status);
-		svc_putnl(resv, rsip->minor_status);
-		svc_putnl(resv, GSS_SEQ_WIN);
-		if (svc_safe_putnetobj(resv, &rsip->out_token))
-			goto out;
-	}
+		return SVC_CLOSE;
+
+	ret = SVC_CLOSE;
+	/* Got an answer to the upcall; use it: */
+	if (gss_write_init_verf(rqstp, rsip))
+		goto out;
+	if (resv->iov_len + 4 > PAGE_SIZE)
+		goto out;
+	svc_putnl(resv, RPC_SUCCESS);
+	if (svc_safe_putnetobj(resv, &rsip->out_handle))
+		goto out;
+	if (resv->iov_len + 3 * 4 > PAGE_SIZE)
+		goto out;
+	svc_putnl(resv, rsip->major_status);
+	svc_putnl(resv, rsip->minor_status);
+	svc_putnl(resv, GSS_SEQ_WIN);
+	if (svc_safe_putnetobj(resv, &rsip->out_token))
+		goto out;
+
 	ret = SVC_COMPLETE;
 out:
 	cache_put(&rsip->h, &rsi_cache);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 1db618f56ec..a5c36c01707 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -75,7 +75,7 @@ nul_marshal(struct rpc_task *task, __be32 *p)
 static int
 nul_refresh(struct rpc_task *task)
 {
-	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags);
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
 	return 0;
 }
 
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index aac2f8b4ee2..4cb70dc6e7a 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -29,7 +29,6 @@ struct unx_cred {
 #endif
 
 static struct rpc_auth		unix_auth;
-static struct rpc_cred_cache	unix_cred_cache;
 static const struct rpc_credops	unix_credops;
 
 static struct rpc_auth *
@@ -141,7 +140,7 @@ static __be32 *
 unx_marshal(struct rpc_task *task, __be32 *p)
 {
 	struct rpc_clnt	*clnt = task->tk_client;
-	struct unx_cred	*cred = container_of(task->tk_msg.rpc_cred, struct unx_cred, uc_base);
+	struct unx_cred	*cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base);
 	__be32		*base, *hold;
 	int		i;
 
@@ -174,7 +173,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
 static int
 unx_refresh(struct rpc_task *task)
 {
-	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_msg.rpc_cred->cr_flags);
+	set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags);
 	return 0;
 }
 
@@ -197,15 +196,20 @@ unx_validate(struct rpc_task *task, __be32 *p)
 		printk("RPC: giant verf size: %u\n", size);
 		return NULL;
 	}
-	task->tk_msg.rpc_cred->cr_auth->au_rslack = (size >> 2) + 2;
+	task->tk_rqstp->rq_cred->cr_auth->au_rslack = (size >> 2) + 2;
 	p += (size >> 2);
 
 	return p;
 }
 
-void __init rpc_init_authunix(void)
+int __init rpc_init_authunix(void)
 {
-	spin_lock_init(&unix_cred_cache.lock);
+	return rpcauth_init_credcache(&unix_auth);
+}
+
+void rpc_destroy_authunix(void)
+{
+	rpcauth_destroy_credcache(&unix_auth);
 }
 
 const struct rpc_authops authunix_ops = {
@@ -219,17 +223,12 @@ const struct rpc_authops authunix_ops = {
 };
 
 static
-struct rpc_cred_cache	unix_cred_cache = {
-};
-
-static
 struct rpc_auth		unix_auth = {
 	.au_cslack	= UNX_WRITESLACK,
 	.au_rslack	= 2,			/* assume AUTH_NULL verf */
 	.au_ops		= &authunix_ops,
 	.au_flavor	= RPC_AUTH_UNIX,
 	.au_count	= ATOMIC_INIT(0),
-	.au_credcache	= &unix_cred_cache,
 };
 
 static
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 58de76c8540..e433e7580e2 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -28,22 +28,21 @@
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <asm/ioctls.h>
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
-#include <linux/smp_lock.h>
+#include "netns.h"
 
 #define	 RPCDBG_FACILITY RPCDBG_CACHE
 
-static int cache_defer_req(struct cache_req *req, struct cache_head *item);
+static void cache_defer_req(struct cache_req *req, struct cache_head *item);
 static void cache_revisit_request(struct cache_head *item);
 
 static void cache_init(struct cache_head *h)
 {
-	time_t now = get_seconds();
+	time_t now = seconds_since_boot();
 	h->next = NULL;
 	h->flags = 0;
 	kref_init(&h->ref);
@@ -53,7 +52,7 @@ static void cache_init(struct cache_head *h)
 
 static inline int cache_is_expired(struct cache_detail *detail, struct cache_head *h)
 {
-	return  (h->expiry_time < get_seconds()) ||
+	return  (h->expiry_time < seconds_since_boot()) ||
 		(detail->flush_time > h->last_refresh);
 }
 
@@ -128,7 +127,7 @@ static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
 static void cache_fresh_locked(struct cache_head *head, time_t expiry)
 {
 	head->expiry_time = expiry;
-	head->last_refresh = get_seconds();
+	head->last_refresh = seconds_since_boot();
 	set_bit(CACHE_VALID, &head->flags);
 }
 
@@ -239,7 +238,7 @@ int cache_check(struct cache_detail *detail,
 
 	/* now see if we want to start an upcall */
 	refresh_age = (h->expiry_time - h->last_refresh);
-	age = get_seconds() - h->last_refresh;
+	age = seconds_since_boot() - h->last_refresh;
 
 	if (rqstp == NULL) {
 		if (rv == -EAGAIN)
@@ -254,7 +253,7 @@ int cache_check(struct cache_detail *detail,
 				cache_revisit_request(h);
 				if (rv == -EAGAIN) {
 					set_bit(CACHE_NEGATIVE, &h->flags);
-					cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY);
+					cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
 					cache_fresh_unlocked(h, detail);
 					rv = -ENOENT;
 				}
@@ -269,7 +268,8 @@ int cache_check(struct cache_detail *detail,
 	}
 
 	if (rv == -EAGAIN) {
-		if (cache_defer_req(rqstp, h) < 0) {
+		cache_defer_req(rqstp, h);
+		if (!test_bit(CACHE_PENDING, &h->flags)) {
 			/* Request is not deferred */
 			rv = cache_is_valid(detail, h);
 			if (rv == -EAGAIN)
@@ -320,7 +320,7 @@ static struct cache_detail *current_detail;
 static int current_index;
 
 static void do_cache_clean(struct work_struct *work);
-static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
+static struct delayed_work cache_cleaner;
 
 static void sunrpc_init_cache_detail(struct cache_detail *cd)
 {
@@ -389,11 +389,11 @@ static int cache_clean(void)
 			return -1;
 		}
 		current_detail = list_entry(next, struct cache_detail, others);
-		if (current_detail->nextcheck > get_seconds())
+		if (current_detail->nextcheck > seconds_since_boot())
 			current_index = current_detail->hash_size;
 		else {
 			current_index = 0;
-			current_detail->nextcheck = get_seconds()+30*60;
+			current_detail->nextcheck = seconds_since_boot()+30*60;
 		}
 	}
 
@@ -478,7 +478,7 @@ EXPORT_SYMBOL_GPL(cache_flush);
 void cache_purge(struct cache_detail *detail)
 {
 	detail->flush_time = LONG_MAX;
-	detail->nextcheck = get_seconds();
+	detail->nextcheck = seconds_since_boot();
 	cache_flush();
 	detail->flush_time = 1;
 }
@@ -507,81 +507,155 @@ EXPORT_SYMBOL_GPL(cache_purge);
 
 static DEFINE_SPINLOCK(cache_defer_lock);
 static LIST_HEAD(cache_defer_list);
-static struct list_head cache_defer_hash[DFR_HASHSIZE];
+static struct hlist_head cache_defer_hash[DFR_HASHSIZE];
 static int cache_defer_cnt;
 
-static int cache_defer_req(struct cache_req *req, struct cache_head *item)
+static void __unhash_deferred_req(struct cache_deferred_req *dreq)
+{
+	hlist_del_init(&dreq->hash);
+	if (!list_empty(&dreq->recent)) {
+		list_del_init(&dreq->recent);
+		cache_defer_cnt--;
+	}
+}
+
+static void __hash_deferred_req(struct cache_deferred_req *dreq, struct cache_head *item)
 {
-	struct cache_deferred_req *dreq, *discard;
 	int hash = DFR_HASH(item);
 
-	if (cache_defer_cnt >= DFR_MAX) {
-		/* too much in the cache, randomly drop this one,
-		 * or continue and drop the oldest below
-		 */
-		if (net_random()&1)
-			return -ENOMEM;
-	}
-	dreq = req->defer(req);
-	if (dreq == NULL)
-		return -ENOMEM;
+	INIT_LIST_HEAD(&dreq->recent);
+	hlist_add_head(&dreq->hash, &cache_defer_hash[hash]);
+}
+
+static void setup_deferral(struct cache_deferred_req *dreq,
+			   struct cache_head *item,
+			   int count_me)
+{
 
 	dreq->item = item;
 
 	spin_lock(&cache_defer_lock);
 
-	list_add(&dreq->recent, &cache_defer_list);
+	__hash_deferred_req(dreq, item);
 
-	if (cache_defer_hash[hash].next == NULL)
-		INIT_LIST_HEAD(&cache_defer_hash[hash]);
-	list_add(&dreq->hash, &cache_defer_hash[hash]);
-
-	/* it is in, now maybe clean up */
-	discard = NULL;
-	if (++cache_defer_cnt > DFR_MAX) {
-		discard = list_entry(cache_defer_list.prev,
-				     struct cache_deferred_req, recent);
-		list_del_init(&discard->recent);
-		list_del_init(&discard->hash);
-		cache_defer_cnt--;
+	if (count_me) {
+		cache_defer_cnt++;
+		list_add(&dreq->recent, &cache_defer_list);
 	}
+
 	spin_unlock(&cache_defer_lock);
 
+}
+
+struct thread_deferred_req {
+	struct cache_deferred_req handle;
+	struct completion completion;
+};
+
+static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many)
+{
+	struct thread_deferred_req *dr =
+		container_of(dreq, struct thread_deferred_req, handle);
+	complete(&dr->completion);
+}
+
+static void cache_wait_req(struct cache_req *req, struct cache_head *item)
+{
+	struct thread_deferred_req sleeper;
+	struct cache_deferred_req *dreq = &sleeper.handle;
+
+	sleeper.completion = COMPLETION_INITIALIZER_ONSTACK(sleeper.completion);
+	dreq->revisit = cache_restart_thread;
+
+	setup_deferral(dreq, item, 0);
+
+	if (!test_bit(CACHE_PENDING, &item->flags) ||
+	    wait_for_completion_interruptible_timeout(
+		    &sleeper.completion, req->thread_wait) <= 0) {
+		/* The completion wasn't completed, so we need
+		 * to clean up
+		 */
+		spin_lock(&cache_defer_lock);
+		if (!hlist_unhashed(&sleeper.handle.hash)) {
+			__unhash_deferred_req(&sleeper.handle);
+			spin_unlock(&cache_defer_lock);
+		} else {
+			/* cache_revisit_request already removed
+			 * this from the hash table, but hasn't
+			 * called ->revisit yet.  It will very soon
+			 * and we need to wait for it.
+			 */
+			spin_unlock(&cache_defer_lock);
+			wait_for_completion(&sleeper.completion);
+		}
+	}
+}
+
+static void cache_limit_defers(void)
+{
+	/* Make sure we haven't exceed the limit of allowed deferred
+	 * requests.
+	 */
+	struct cache_deferred_req *discard = NULL;
+
+	if (cache_defer_cnt <= DFR_MAX)
+		return;
+
+	spin_lock(&cache_defer_lock);
+
+	/* Consider removing either the first or the last */
+	if (cache_defer_cnt > DFR_MAX) {
+		if (net_random() & 1)
+			discard = list_entry(cache_defer_list.next,
+					     struct cache_deferred_req, recent);
+		else
+			discard = list_entry(cache_defer_list.prev,
+					     struct cache_deferred_req, recent);
+		__unhash_deferred_req(discard);
+	}
+	spin_unlock(&cache_defer_lock);
 	if (discard)
-		/* there was one too many */
 		discard->revisit(discard, 1);
+}
 
-	if (!test_bit(CACHE_PENDING, &item->flags)) {
-		/* must have just been validated... */
-		cache_revisit_request(item);
-		return -EAGAIN;
+static void cache_defer_req(struct cache_req *req, struct cache_head *item)
+{
+	struct cache_deferred_req *dreq;
+
+	if (req->thread_wait) {
+		cache_wait_req(req, item);
+		if (!test_bit(CACHE_PENDING, &item->flags))
+			return;
 	}
-	return 0;
+	dreq = req->defer(req);
+	if (dreq == NULL)
+		return;
+	setup_deferral(dreq, item, 1);
+	if (!test_bit(CACHE_PENDING, &item->flags))
+		/* Bit could have been cleared before we managed to
+		 * set up the deferral, so need to revisit just in case
+		 */
+		cache_revisit_request(item);
+
+	cache_limit_defers();
 }
 
 static void cache_revisit_request(struct cache_head *item)
 {
 	struct cache_deferred_req *dreq;
 	struct list_head pending;
-
-	struct list_head *lp;
+	struct hlist_node *lp, *tmp;
 	int hash = DFR_HASH(item);
 
 	INIT_LIST_HEAD(&pending);
 	spin_lock(&cache_defer_lock);
 
-	lp = cache_defer_hash[hash].next;
-	if (lp) {
-		while (lp != &cache_defer_hash[hash]) {
-			dreq = list_entry(lp, struct cache_deferred_req, hash);
-			lp = lp->next;
-			if (dreq->item == item) {
-				list_del_init(&dreq->hash);
-				list_move(&dreq->recent, &pending);
-				cache_defer_cnt--;
-			}
+	hlist_for_each_entry_safe(dreq, lp, tmp, &cache_defer_hash[hash], hash)
+		if (dreq->item == item) {
+			__unhash_deferred_req(dreq);
+			list_add(&dreq->recent, &pending);
 		}
-	}
+
 	spin_unlock(&cache_defer_lock);
 
 	while (!list_empty(&pending)) {
@@ -602,9 +676,8 @@ void cache_clean_deferred(void *owner)
 
 	list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) {
 		if (dreq->owner == owner) {
-			list_del_init(&dreq->hash);
-			list_move(&dreq->recent, &pending);
-			cache_defer_cnt--;
+			__unhash_deferred_req(dreq);
+			list_add(&dreq->recent, &pending);
 		}
 	}
 	spin_unlock(&cache_defer_lock);
@@ -903,7 +976,7 @@ static int cache_release(struct inode *inode, struct file *filp,
 		filp->private_data = NULL;
 		kfree(rp);
 
-		cd->last_close = get_seconds();
+		cd->last_close = seconds_since_boot();
 		atomic_dec(&cd->readers);
 	}
 	module_put(cd->owner);
@@ -1016,6 +1089,23 @@ static void warn_no_listener(struct cache_detail *detail)
 	}
 }
 
+static bool cache_listeners_exist(struct cache_detail *detail)
+{
+	if (atomic_read(&detail->readers))
+		return true;
+	if (detail->last_close == 0)
+		/* This cache was never opened */
+		return false;
+	if (detail->last_close < seconds_since_boot() - 30)
+		/*
+		 * We allow for the possibility that someone might
+		 * restart a userspace daemon without restarting the
+		 * server; but after 30 seconds, we give up.
+		 */
+		 return false;
+	return true;
+}
+
 /*
  * register an upcall request to user-space and queue it up for read() by the
  * upcall daemon.
@@ -1034,10 +1124,9 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h,
 	char *bp;
 	int len;
 
-	if (atomic_read(&detail->readers) == 0 &&
-	    detail->last_close < get_seconds() - 30) {
-			warn_no_listener(detail);
-			return -EINVAL;
+	if (!cache_listeners_exist(detail)) {
+		warn_no_listener(detail);
+		return -EINVAL;
 	}
 
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -1096,13 +1185,19 @@ int qword_get(char **bpp, char *dest, int bufsize)
 	if (bp[0] == '\\' && bp[1] == 'x') {
 		/* HEX STRING */
 		bp += 2;
-		while (isxdigit(bp[0]) && isxdigit(bp[1]) && len < bufsize) {
-			int byte = isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10;
-			bp++;
-			byte <<= 4;
-			byte |= isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10;
-			*dest++ = byte;
-			bp++;
+		while (len < bufsize) {
+			int h, l;
+
+			h = hex_to_bin(bp[0]);
+			if (h < 0)
+				break;
+
+			l = hex_to_bin(bp[1]);
+			if (l < 0)
+				break;
+
+			*dest++ = (h << 4) | l;
+			bp += 2;
 			len++;
 		}
 	} else {
@@ -1220,7 +1315,8 @@ static int c_show(struct seq_file *m, void *p)
 
 	ifdebug(CACHE)
 		seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n",
-			   cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags);
+			   convert_to_wallclock(cp->expiry_time),
+			   atomic_read(&cp->ref.refcount), cp->flags);
 	cache_get(cp);
 	if (cache_check(cd, cp, NULL))
 		/* cache_check does a cache_put on failure */
@@ -1286,7 +1382,7 @@ static ssize_t read_flush(struct file *file, char __user *buf,
 	unsigned long p = *ppos;
 	size_t len;
 
-	sprintf(tbuf, "%lu\n", cd->flush_time);
+	sprintf(tbuf, "%lu\n", convert_to_wallclock(cd->flush_time));
 	len = strlen(tbuf);
 	if (p >= len)
 		return 0;
@@ -1304,19 +1400,20 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 			   struct cache_detail *cd)
 {
 	char tbuf[20];
-	char *ep;
-	long flushtime;
+	char *bp, *ep;
+
 	if (*ppos || count > sizeof(tbuf)-1)
 		return -EINVAL;
 	if (copy_from_user(tbuf, buf, count))
 		return -EFAULT;
 	tbuf[count] = 0;
-	flushtime = simple_strtoul(tbuf, &ep, 0);
+	simple_strtoul(tbuf, &ep, 0);
 	if (*ep && *ep != '\n')
 		return -EINVAL;
 
-	cd->flush_time = flushtime;
-	cd->nextcheck = get_seconds();
+	bp = tbuf;
+	cd->flush_time = get_expiry(&bp);
+	cd->nextcheck = seconds_since_boot();
 	cache_flush();
 
 	*ppos += count;
@@ -1349,15 +1446,10 @@ static unsigned int cache_poll_procfs(struct file *filp, poll_table *wait)
 static long cache_ioctl_procfs(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
-	long ret;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct cache_detail *cd = PDE(inode)->data;
 
-	lock_kernel();
-	ret = cache_ioctl(inode, filp, cmd, arg, cd);
-	unlock_kernel();
-
-	return ret;
+	return cache_ioctl(inode, filp, cmd, arg, cd);
 }
 
 static int cache_open_procfs(struct inode *inode, struct file *filp)
@@ -1442,10 +1534,13 @@ static const struct file_operations cache_flush_operations_procfs = {
 	.read		= read_flush_procfs,
 	.write		= write_flush_procfs,
 	.release	= release_flush_procfs,
+	.llseek		= no_llseek,
 };
 
-static void remove_cache_proc_entries(struct cache_detail *cd)
+static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
 {
+	struct sunrpc_net *sn;
+
 	if (cd->u.procfs.proc_ent == NULL)
 		return;
 	if (cd->u.procfs.flush_ent)
@@ -1455,15 +1550,18 @@ static void remove_cache_proc_entries(struct cache_detail *cd)
 	if (cd->u.procfs.content_ent)
 		remove_proc_entry("content", cd->u.procfs.proc_ent);
 	cd->u.procfs.proc_ent = NULL;
-	remove_proc_entry(cd->name, proc_net_rpc);
+	sn = net_generic(net, sunrpc_net_id);
+	remove_proc_entry(cd->name, sn->proc_net_rpc);
 }
 
 #ifdef CONFIG_PROC_FS
-static int create_cache_proc_entries(struct cache_detail *cd)
+static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 {
 	struct proc_dir_entry *p;
+	struct sunrpc_net *sn;
 
-	cd->u.procfs.proc_ent = proc_mkdir(cd->name, proc_net_rpc);
+	sn = net_generic(net, sunrpc_net_id);
+	cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
 	if (cd->u.procfs.proc_ent == NULL)
 		goto out_nomem;
 	cd->u.procfs.channel_ent = NULL;
@@ -1494,33 +1592,48 @@ static int create_cache_proc_entries(struct cache_detail *cd)
 	}
 	return 0;
 out_nomem:
-	remove_cache_proc_entries(cd);
+	remove_cache_proc_entries(cd, net);
 	return -ENOMEM;
 }
 #else /* CONFIG_PROC_FS */
-static int create_cache_proc_entries(struct cache_detail *cd)
+static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 {
 	return 0;
 }
 #endif
 
-int cache_register(struct cache_detail *cd)
+void __init cache_initialize(void)
+{
+	INIT_DELAYED_WORK_DEFERRABLE(&cache_cleaner, do_cache_clean);
+}
+
+int cache_register_net(struct cache_detail *cd, struct net *net)
 {
 	int ret;
 
 	sunrpc_init_cache_detail(cd);
-	ret = create_cache_proc_entries(cd);
+	ret = create_cache_proc_entries(cd, net);
 	if (ret)
 		sunrpc_destroy_cache_detail(cd);
 	return ret;
 }
+
+int cache_register(struct cache_detail *cd)
+{
+	return cache_register_net(cd, &init_net);
+}
 EXPORT_SYMBOL_GPL(cache_register);
 
-void cache_unregister(struct cache_detail *cd)
+void cache_unregister_net(struct cache_detail *cd, struct net *net)
 {
-	remove_cache_proc_entries(cd);
+	remove_cache_proc_entries(cd, net);
 	sunrpc_destroy_cache_detail(cd);
 }
+
+void cache_unregister(struct cache_detail *cd)
+{
+	cache_unregister_net(cd, &init_net);
+}
 EXPORT_SYMBOL_GPL(cache_unregister);
 
 static ssize_t cache_read_pipefs(struct file *filp, char __user *buf,
@@ -1551,13 +1664,8 @@ static long cache_ioctl_pipefs(struct file *filp,
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct cache_detail *cd = RPC_I(inode)->private;
-	long ret;
-
-	lock_kernel();
-	ret = cache_ioctl(inode, filp, cmd, arg, cd);
-	unlock_kernel();
 
-	return ret;
+	return cache_ioctl(inode, filp, cmd, arg, cd);
 }
 
 static int cache_open_pipefs(struct inode *inode, struct file *filp)
@@ -1642,6 +1750,7 @@ const struct file_operations cache_flush_operations_pipefs = {
 	.read		= read_flush_pipefs,
 	.write		= write_flush_pipefs,
 	.release	= release_flush_pipefs,
+	.llseek		= no_llseek,
 };
 
 int sunrpc_cache_register_pipefs(struct dentry *parent,
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 756fc324db9..9dab9573be4 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -226,7 +226,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 			goto out_no_principal;
 	}
 
-	kref_init(&clnt->cl_kref);
+	atomic_set(&clnt->cl_count, 1);
 
 	err = rpc_setup_pipedir(clnt, program->pipe_dir_name);
 	if (err < 0)
@@ -284,6 +284,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 	struct rpc_xprt *xprt;
 	struct rpc_clnt *clnt;
 	struct xprt_create xprtargs = {
+		.net = args->net,
 		.ident = args->protocol,
 		.srcaddr = args->saddress,
 		.dstaddr = args->address,
@@ -390,14 +391,14 @@ rpc_clone_client(struct rpc_clnt *clnt)
 		if (new->cl_principal == NULL)
 			goto out_no_principal;
 	}
-	kref_init(&new->cl_kref);
+	atomic_set(&new->cl_count, 1);
 	err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
 	if (err != 0)
 		goto out_no_path;
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
 	xprt_get(clnt->cl_xprt);
-	kref_get(&clnt->cl_kref);
+	atomic_inc(&clnt->cl_count);
 	rpc_register_client(new);
 	rpciod_up();
 	return new;
@@ -414,6 +415,35 @@ out_no_clnt:
 EXPORT_SYMBOL_GPL(rpc_clone_client);
 
 /*
+ * Kill all tasks for the given client.
+ * XXX: kill their descendants as well?
+ */
+void rpc_killall_tasks(struct rpc_clnt *clnt)
+{
+	struct rpc_task	*rovr;
+
+
+	if (list_empty(&clnt->cl_tasks))
+		return;
+	dprintk("RPC:       killing all tasks for client %p\n", clnt);
+	/*
+	 * Spin lock all_tasks to prevent changes...
+	 */
+	spin_lock(&clnt->cl_lock);
+	list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
+		if (!RPC_IS_ACTIVATED(rovr))
+			continue;
+		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
+			rovr->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(rovr, -EIO);
+			rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr);
+		}
+	}
+	spin_unlock(&clnt->cl_lock);
+}
+EXPORT_SYMBOL_GPL(rpc_killall_tasks);
+
+/*
  * Properly shut down an RPC client, terminating all outstanding
  * requests.
  */
@@ -436,10 +466,8 @@ EXPORT_SYMBOL_GPL(rpc_shutdown_client);
  * Free an RPC client
  */
 static void
-rpc_free_client(struct kref *kref)
+rpc_free_client(struct rpc_clnt *clnt)
 {
-	struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
-
 	dprintk("RPC:       destroying %s client for %s\n",
 			clnt->cl_protname, clnt->cl_server);
 	if (!IS_ERR(clnt->cl_path.dentry)) {
@@ -466,12 +494,10 @@ out_free:
  * Free an RPC client
  */
 static void
-rpc_free_auth(struct kref *kref)
+rpc_free_auth(struct rpc_clnt *clnt)
 {
-	struct rpc_clnt *clnt = container_of(kref, struct rpc_clnt, cl_kref);
-
 	if (clnt->cl_auth == NULL) {
-		rpc_free_client(kref);
+		rpc_free_client(clnt);
 		return;
 	}
 
@@ -480,10 +506,11 @@ rpc_free_auth(struct kref *kref)
 	 *       release remaining GSS contexts. This mechanism ensures
 	 *       that it can do so safely.
 	 */
-	kref_init(kref);
+	atomic_inc(&clnt->cl_count);
 	rpcauth_release(clnt->cl_auth);
 	clnt->cl_auth = NULL;
-	kref_put(kref, rpc_free_client);
+	if (atomic_dec_and_test(&clnt->cl_count))
+		rpc_free_client(clnt);
 }
 
 /*
@@ -496,7 +523,8 @@ rpc_release_client(struct rpc_clnt *clnt)
 
 	if (list_empty(&clnt->cl_tasks))
 		wake_up(&destroy_wait);
-	kref_put(&clnt->cl_kref, rpc_free_auth);
+	if (atomic_dec_and_test(&clnt->cl_count))
+		rpc_free_auth(clnt);
 }
 
 /**
@@ -538,6 +566,49 @@ out:
 }
 EXPORT_SYMBOL_GPL(rpc_bind_new_program);
 
+void rpc_task_release_client(struct rpc_task *task)
+{
+	struct rpc_clnt *clnt = task->tk_client;
+
+	if (clnt != NULL) {
+		/* Remove from client task list */
+		spin_lock(&clnt->cl_lock);
+		list_del(&task->tk_task);
+		spin_unlock(&clnt->cl_lock);
+		task->tk_client = NULL;
+
+		rpc_release_client(clnt);
+	}
+}
+
+static
+void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
+{
+	if (clnt != NULL) {
+		rpc_task_release_client(task);
+		task->tk_client = clnt;
+		atomic_inc(&clnt->cl_count);
+		if (clnt->cl_softrtry)
+			task->tk_flags |= RPC_TASK_SOFT;
+		/* Add to the client's list of all tasks */
+		spin_lock(&clnt->cl_lock);
+		list_add_tail(&task->tk_task, &clnt->cl_tasks);
+		spin_unlock(&clnt->cl_lock);
+	}
+}
+
+static void
+rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
+{
+	if (msg != NULL) {
+		task->tk_msg.rpc_proc = msg->rpc_proc;
+		task->tk_msg.rpc_argp = msg->rpc_argp;
+		task->tk_msg.rpc_resp = msg->rpc_resp;
+		if (msg->rpc_cred != NULL)
+			task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred);
+	}
+}
+
 /*
  * Default callback for async RPC calls
  */
@@ -562,6 +633,18 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
 	if (IS_ERR(task))
 		goto out;
 
+	rpc_task_set_client(task, task_setup_data->rpc_client);
+	rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
+
+	if (task->tk_status != 0) {
+		int ret = task->tk_status;
+		rpc_put_task(task);
+		return ERR_PTR(ret);
+	}
+
+	if (task->tk_action == NULL)
+		rpc_call_start(task);
+
 	atomic_inc(&task->tk_count);
 	rpc_execute(task);
 out:
@@ -756,12 +839,13 @@ EXPORT_SYMBOL_GPL(rpc_force_rebind);
  * Restart an (async) RPC call from the call_prepare state.
  * Usually called from within the exit handler.
  */
-void
+int
 rpc_restart_call_prepare(struct rpc_task *task)
 {
 	if (RPC_ASSASSINATED(task))
-		return;
+		return 0;
 	task->tk_action = rpc_prepare_task;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
 
@@ -769,13 +853,13 @@ EXPORT_SYMBOL_GPL(rpc_restart_call_prepare);
  * Restart an (async) RPC call. Usually called from within the
  * exit handler.
  */
-void
+int
 rpc_restart_call(struct rpc_task *task)
 {
 	if (RPC_ASSASSINATED(task))
-		return;
-
+		return 0;
 	task->tk_action = call_start;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(rpc_restart_call);
 
@@ -824,11 +908,6 @@ call_reserve(struct rpc_task *task)
 {
 	dprint_status(task);
 
-	if (!rpcauth_uptodatecred(task)) {
-		task->tk_action = call_refresh;
-		return;
-	}
-
 	task->tk_status  = 0;
 	task->tk_action  = call_reserveresult;
 	xprt_reserve(task);
@@ -851,7 +930,7 @@ call_reserveresult(struct rpc_task *task)
 	task->tk_status = 0;
 	if (status >= 0) {
 		if (task->tk_rqstp) {
-			task->tk_action = call_allocate;
+			task->tk_action = call_refresh;
 			return;
 		}
 
@@ -886,13 +965,54 @@ call_reserveresult(struct rpc_task *task)
 }
 
 /*
- * 2.	Allocate the buffer. For details, see sched.c:rpc_malloc.
+ * 2.	Bind and/or refresh the credentials
+ */
+static void
+call_refresh(struct rpc_task *task)
+{
+	dprint_status(task);
+
+	task->tk_action = call_refreshresult;
+	task->tk_status = 0;
+	task->tk_client->cl_stats->rpcauthrefresh++;
+	rpcauth_refreshcred(task);
+}
+
+/*
+ * 2a.	Process the results of a credential refresh
+ */
+static void
+call_refreshresult(struct rpc_task *task)
+{
+	int status = task->tk_status;
+
+	dprint_status(task);
+
+	task->tk_status = 0;
+	task->tk_action = call_allocate;
+	if (status >= 0 && rpcauth_uptodatecred(task))
+		return;
+	switch (status) {
+	case -EACCES:
+		rpc_exit(task, -EACCES);
+		return;
+	case -ENOMEM:
+		rpc_exit(task, -ENOMEM);
+		return;
+	case -ETIMEDOUT:
+		rpc_delay(task, 3*HZ);
+	}
+	task->tk_action = call_refresh;
+}
+
+/*
+ * 2b.	Allocate the buffer. For details, see sched.c:rpc_malloc.
  *	(Note: buffer memory is freed in xprt_release).
  */
 static void
 call_allocate(struct rpc_task *task)
 {
-	unsigned int slack = task->tk_msg.rpc_cred->cr_auth->au_cslack;
+	unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack;
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = task->tk_xprt;
 	struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
@@ -1472,43 +1592,6 @@ out_retry:
 	}
 }
 
-/*
- * 8.	Refresh the credentials if rejected by the server
- */
-static void
-call_refresh(struct rpc_task *task)
-{
-	dprint_status(task);
-
-	task->tk_action = call_refreshresult;
-	task->tk_status = 0;
-	task->tk_client->cl_stats->rpcauthrefresh++;
-	rpcauth_refreshcred(task);
-}
-
-/*
- * 8a.	Process the results of a credential refresh
- */
-static void
-call_refreshresult(struct rpc_task *task)
-{
-	int status = task->tk_status;
-
-	dprint_status(task);
-
-	task->tk_status = 0;
-	task->tk_action = call_reserve;
-	if (status >= 0 && rpcauth_uptodatecred(task))
-		return;
-	if (status == -EACCES) {
-		rpc_exit(task, -EACCES);
-		return;
-	}
-	task->tk_action = call_refresh;
-	if (status != -ETIMEDOUT)
-		rpc_delay(task, 3*HZ);
-}
-
 static __be32 *
 rpc_encode_header(struct rpc_task *task)
 {
@@ -1593,7 +1676,7 @@ rpc_verify_header(struct rpc_task *task)
 			rpcauth_invalcred(task);
 			/* Ensure we obtain a new XID! */
 			xprt_release(task);
-			task->tk_action = call_refresh;
+			task->tk_action = call_reserve;
 			goto out_retry;
 		case RPC_AUTH_BADCRED:
 		case RPC_AUTH_BADVERF:
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
new file mode 100644
index 00000000000..d013bf211ca
--- /dev/null
+++ b/net/sunrpc/netns.h
@@ -0,0 +1,19 @@
+#ifndef __SUNRPC_NETNS_H__
+#define __SUNRPC_NETNS_H__
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+struct cache_detail;
+
+struct sunrpc_net {
+	struct proc_dir_entry *proc_net_rpc;
+	struct cache_detail *ip_map_cache;
+};
+
+extern int sunrpc_net_id;
+
+int ip_map_cache_create(struct net *);
+void ip_map_cache_destroy(struct net *);
+
+#endif
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 95ccbcf45d3..10a17a37ec4 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -27,9 +27,8 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/cache.h>
-#include <linux/smp_lock.h>
 
-static struct vfsmount *rpc_mount __read_mostly;
+static struct vfsmount *rpc_mnt __read_mostly;
 static int rpc_mount_count;
 
 static struct file_system_type rpc_pipe_fs_type;
@@ -48,7 +47,7 @@ static void rpc_purge_list(struct rpc_inode *rpci, struct list_head *head,
 		return;
 	do {
 		msg = list_entry(head->next, struct rpc_pipe_msg, list);
-		list_del(&msg->list);
+		list_del_init(&msg->list);
 		msg->errno = err;
 		destroy_msg(msg);
 	} while (!list_empty(head));
@@ -204,11 +203,11 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
 	mutex_lock(&inode->i_mutex);
 	if (rpci->ops == NULL)
 		goto out;
-	msg = (struct rpc_pipe_msg *)filp->private_data;
+	msg = filp->private_data;
 	if (msg != NULL) {
 		spin_lock(&inode->i_lock);
 		msg->errno = -EAGAIN;
-		list_del(&msg->list);
+		list_del_init(&msg->list);
 		spin_unlock(&inode->i_lock);
 		rpci->ops->destroy_msg(msg);
 	}
@@ -268,7 +267,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
 	if (res < 0 || msg->len == msg->copied) {
 		filp->private_data = NULL;
 		spin_lock(&inode->i_lock);
-		list_del(&msg->list);
+		list_del_init(&msg->list);
 		spin_unlock(&inode->i_lock);
 		rpci->ops->destroy_msg(msg);
 	}
@@ -309,40 +308,33 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
 	return mask;
 }
 
-static int
-rpc_pipe_ioctl_unlocked(struct file *filp, unsigned int cmd, unsigned long arg)
+static long
+rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct rpc_inode *rpci = RPC_I(inode);
 	int len;
 
 	switch (cmd) {
 	case FIONREAD:
-		if (rpci->ops == NULL)
+		spin_lock(&inode->i_lock);
+		if (rpci->ops == NULL) {
+			spin_unlock(&inode->i_lock);
 			return -EPIPE;
+		}
 		len = rpci->pipelen;
 		if (filp->private_data) {
 			struct rpc_pipe_msg *msg;
-			msg = (struct rpc_pipe_msg *)filp->private_data;
+			msg = filp->private_data;
 			len += msg->len - msg->copied;
 		}
+		spin_unlock(&inode->i_lock);
 		return put_user(len, (int __user *)arg);
 	default:
 		return -EINVAL;
 	}
 }
 
-static long
-rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
-	long ret;
-
-	lock_kernel();
-	ret = rpc_pipe_ioctl_unlocked(filp, cmd, arg);
-	unlock_kernel();
-
-	return ret;
-}
-
 static const struct file_operations rpc_pipe_fops = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
@@ -371,21 +363,23 @@ rpc_show_info(struct seq_file *m, void *v)
 static int
 rpc_info_open(struct inode *inode, struct file *file)
 {
-	struct rpc_clnt *clnt;
+	struct rpc_clnt *clnt = NULL;
 	int ret = single_open(file, rpc_show_info, NULL);
 
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		mutex_lock(&inode->i_mutex);
-		clnt = RPC_I(inode)->private;
-		if (clnt) {
-			kref_get(&clnt->cl_kref);
+
+		spin_lock(&file->f_path.dentry->d_lock);
+		if (!d_unhashed(file->f_path.dentry))
+			clnt = RPC_I(inode)->private;
+		if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) {
+			spin_unlock(&file->f_path.dentry->d_lock);
 			m->private = clnt;
 		} else {
+			spin_unlock(&file->f_path.dentry->d_lock);
 			single_release(inode, file);
 			ret = -EINVAL;
 		}
-		mutex_unlock(&inode->i_mutex);
 	}
 	return ret;
 }
@@ -423,16 +417,16 @@ struct vfsmount *rpc_get_mount(void)
 {
 	int err;
 
-	err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mount, &rpc_mount_count);
+	err = simple_pin_fs(&rpc_pipe_fs_type, &rpc_mnt, &rpc_mount_count);
 	if (err != 0)
 		return ERR_PTR(err);
-	return rpc_mount;
+	return rpc_mnt;
 }
 EXPORT_SYMBOL_GPL(rpc_get_mount);
 
 void rpc_put_mount(void)
 {
-	simple_release_fs(&rpc_mount, &rpc_mount_count);
+	simple_release_fs(&rpc_mnt, &rpc_mount_count);
 }
 EXPORT_SYMBOL_GPL(rpc_put_mount);
 
@@ -451,6 +445,7 @@ rpc_get_inode(struct super_block *sb, umode_t mode)
 	struct inode *inode = new_inode(sb);
 	if (!inode)
 		return NULL;
+	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	switch(mode & S_IFMT) {
@@ -1023,17 +1018,17 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static int
-rpc_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *
+rpc_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *data)
 {
-	return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt);
+	return mount_single(fs_type, flags, data, rpc_fill_super);
 }
 
 static struct file_system_type rpc_pipe_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "rpc_pipefs",
-	.get_sb		= rpc_get_sb,
+	.mount		= rpc_mount,
 	.kill_sb	= kill_litter_super,
 };
 
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index dac219a56ae..fa6d7ca2c85 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -177,6 +177,7 @@ static DEFINE_MUTEX(rpcb_create_local_mutex);
 static int rpcb_create_local(void)
 {
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= XPRT_TRANSPORT_TCP,
 		.address	= (struct sockaddr *)&rpcb_inaddr_loopback,
 		.addrsize	= sizeof(rpcb_inaddr_loopback),
@@ -211,8 +212,9 @@ static int rpcb_create_local(void)
 	 */
 	clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4);
 	if (IS_ERR(clnt4)) {
-		dprintk("RPC:       failed to create local rpcbind v4 "
-				"cleint (errno %ld).\n", PTR_ERR(clnt4));
+		dprintk("RPC:       failed to bind second program to "
+				"rpcbind v4 client (errno %ld).\n",
+				PTR_ERR(clnt4));
 		clnt4 = NULL;
 	}
 
@@ -228,6 +230,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 				    size_t salen, int proto, u32 version)
 {
 	struct rpc_create_args args = {
+		.net		= &init_net,
 		.protocol	= proto,
 		.address	= srvaddr,
 		.addrsize	= salen,
@@ -247,7 +250,7 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
 		((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT);
 		break;
 	default:
-		return NULL;
+		return ERR_PTR(-EAFNOSUPPORT);
 	}
 
 	return rpc_create(&args);
@@ -475,57 +478,6 @@ int rpcb_v4_register(const u32 program, const u32 version,
 	return -EAFNOSUPPORT;
 }
 
-/**
- * rpcb_getport_sync - obtain the port for an RPC service on a given host
- * @sin: address of remote peer
- * @prog: RPC program number to bind
- * @vers: RPC version number to bind
- * @prot: transport protocol to use to make this request
- *
- * Return value is the requested advertised port number,
- * or a negative errno value.
- *
- * Called from outside the RPC client in a synchronous task context.
- * Uses default timeout parameters specified by underlying transport.
- *
- * XXX: Needs to support IPv6
- */
-int rpcb_getport_sync(struct sockaddr_in *sin, u32 prog, u32 vers, int prot)
-{
-	struct rpcbind_args map = {
-		.r_prog		= prog,
-		.r_vers		= vers,
-		.r_prot		= prot,
-		.r_port		= 0,
-	};
-	struct rpc_message msg = {
-		.rpc_proc	= &rpcb_procedures2[RPCBPROC_GETPORT],
-		.rpc_argp	= &map,
-		.rpc_resp	= &map,
-	};
-	struct rpc_clnt	*rpcb_clnt;
-	int status;
-
-	dprintk("RPC:       %s(%pI4, %u, %u, %d)\n",
-		__func__, &sin->sin_addr.s_addr, prog, vers, prot);
-
-	rpcb_clnt = rpcb_create(NULL, (struct sockaddr *)sin,
-				sizeof(*sin), prot, RPCBVERS_2);
-	if (IS_ERR(rpcb_clnt))
-		return PTR_ERR(rpcb_clnt);
-
-	status = rpc_call_sync(rpcb_clnt, &msg, 0);
-	rpc_shutdown_client(rpcb_clnt);
-
-	if (status >= 0) {
-		if (map.r_port != 0)
-			return map.r_port;
-		status = -EACCES;
-	}
-	return status;
-}
-EXPORT_SYMBOL_GPL(rpcb_getport_sync);
-
 static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc)
 {
 	struct rpc_message msg = {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 4a843b883b8..243fc09b164 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -246,17 +246,8 @@ static inline void rpc_task_set_debuginfo(struct rpc_task *task)
 
 static void rpc_set_active(struct rpc_task *task)
 {
-	struct rpc_clnt *clnt;
-	if (test_and_set_bit(RPC_TASK_ACTIVE, &task->tk_runstate) != 0)
-		return;
 	rpc_task_set_debuginfo(task);
-	/* Add to global list of all tasks */
-	clnt = task->tk_client;
-	if (clnt != NULL) {
-		spin_lock(&clnt->cl_lock);
-		list_add_tail(&task->tk_task, &clnt->cl_tasks);
-		spin_unlock(&clnt->cl_lock);
-	}
+	set_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
 }
 
 /*
@@ -319,11 +310,6 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 	dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n",
 			task->tk_pid, rpc_qname(q), jiffies);
 
-	if (!RPC_IS_ASYNC(task) && !RPC_IS_ACTIVATED(task)) {
-		printk(KERN_ERR "RPC: Inactive synchronous task put to sleep!\n");
-		return;
-	}
-
 	__rpc_add_wait_queue(q, task);
 
 	BUG_ON(task->tk_callback != NULL);
@@ -334,8 +320,8 @@ static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
 				rpc_action action)
 {
-	/* Mark the task as being activated if so needed */
-	rpc_set_active(task);
+	/* We shouldn't ever put an inactive task to sleep */
+	BUG_ON(!RPC_IS_ACTIVATED(task));
 
 	/*
 	 * Protect the queue operations.
@@ -390,7 +376,7 @@ int rpc_queue_empty(struct rpc_wait_queue *queue)
 	spin_lock_bh(&queue->lock);
 	res = queue->qlen;
 	spin_unlock_bh(&queue->lock);
-	return (res == 0);
+	return res == 0;
 }
 EXPORT_SYMBOL_GPL(rpc_queue_empty);
 
@@ -406,14 +392,6 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task
 EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
 
 /*
- * Wake up the specified task
- */
-static void rpc_wake_up_task(struct rpc_task *task)
-{
-	rpc_wake_up_queued_task(task->tk_waitqueue, task);
-}
-
-/*
  * Wake up the next task on a priority queue.
  */
 static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue)
@@ -600,7 +578,15 @@ void rpc_exit_task(struct rpc_task *task)
 		}
 	}
 }
-EXPORT_SYMBOL_GPL(rpc_exit_task);
+
+void rpc_exit(struct rpc_task *task, int status)
+{
+	task->tk_status = status;
+	task->tk_action = rpc_exit_task;
+	if (RPC_IS_QUEUED(task))
+		rpc_wake_up_queued_task(task->tk_waitqueue, task);
+}
+EXPORT_SYMBOL_GPL(rpc_exit);
 
 void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
 {
@@ -690,7 +676,6 @@ static void __rpc_execute(struct rpc_task *task)
 			dprintk("RPC: %5u got signal\n", task->tk_pid);
 			task->tk_flags |= RPC_TASK_KILLED;
 			rpc_exit(task, -ERESTARTSYS);
-			rpc_wake_up_task(task);
 		}
 		rpc_set_running(task);
 		dprintk("RPC: %5u sync task resuming\n", task->tk_pid);
@@ -714,8 +699,9 @@ static void __rpc_execute(struct rpc_task *task)
 void rpc_execute(struct rpc_task *task)
 {
 	rpc_set_active(task);
-	rpc_set_running(task);
-	__rpc_execute(task);
+	rpc_make_runnable(task);
+	if (!RPC_IS_ASYNC(task))
+		__rpc_execute(task);
 }
 
 static void rpc_async_schedule(struct work_struct *work)
@@ -808,26 +794,9 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = task_setup_data->workqueue;
 
-	task->tk_client = task_setup_data->rpc_client;
-	if (task->tk_client != NULL) {
-		kref_get(&task->tk_client->cl_kref);
-		if (task->tk_client->cl_softrtry)
-			task->tk_flags |= RPC_TASK_SOFT;
-	}
-
 	if (task->tk_ops->rpc_call_prepare != NULL)
 		task->tk_action = rpc_prepare_task;
 
-	if (task_setup_data->rpc_message != NULL) {
-		task->tk_msg.rpc_proc = task_setup_data->rpc_message->rpc_proc;
-		task->tk_msg.rpc_argp = task_setup_data->rpc_message->rpc_argp;
-		task->tk_msg.rpc_resp = task_setup_data->rpc_message->rpc_resp;
-		/* Bind the user cred */
-		rpcauth_bindcred(task, task_setup_data->rpc_message->rpc_cred, task_setup_data->flags);
-		if (task->tk_action == NULL)
-			rpc_call_start(task);
-	}
-
 	/* starting timestamp */
 	task->tk_start = ktime_get();
 
@@ -896,11 +865,8 @@ void rpc_put_task(struct rpc_task *task)
 	if (task->tk_rqstp)
 		xprt_release(task);
 	if (task->tk_msg.rpc_cred)
-		rpcauth_unbindcred(task);
-	if (task->tk_client) {
-		rpc_release_client(task->tk_client);
-		task->tk_client = NULL;
-	}
+		put_rpccred(task->tk_msg.rpc_cred);
+	rpc_task_release_client(task);
 	if (task->tk_workqueue != NULL) {
 		INIT_WORK(&task->u.tk_work, rpc_async_release);
 		queue_work(task->tk_workqueue, &task->u.tk_work);
@@ -913,13 +879,6 @@ static void rpc_release_task(struct rpc_task *task)
 {
 	dprintk("RPC: %5u release task\n", task->tk_pid);
 
-	if (!list_empty(&task->tk_task)) {
-		struct rpc_clnt *clnt = task->tk_client;
-		/* Remove from client task list */
-		spin_lock(&clnt->cl_lock);
-		list_del(&task->tk_task);
-		spin_unlock(&clnt->cl_lock);
-	}
 	BUG_ON (RPC_IS_QUEUED(task));
 
 	/* Wake up anyone who is waiting for task completion */
@@ -928,35 +887,6 @@ static void rpc_release_task(struct rpc_task *task)
 	rpc_put_task(task);
 }
 
-/*
- * Kill all tasks for the given client.
- * XXX: kill their descendants as well?
- */
-void rpc_killall_tasks(struct rpc_clnt *clnt)
-{
-	struct rpc_task	*rovr;
-
-
-	if (list_empty(&clnt->cl_tasks))
-		return;
-	dprintk("RPC:       killing all tasks for client %p\n", clnt);
-	/*
-	 * Spin lock all_tasks to prevent changes...
-	 */
-	spin_lock(&clnt->cl_lock);
-	list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
-		if (! RPC_IS_ACTIVATED(rovr))
-			continue;
-		if (!(rovr->tk_flags & RPC_TASK_KILLED)) {
-			rovr->tk_flags |= RPC_TASK_KILLED;
-			rpc_exit(rovr, -EIO);
-			rpc_wake_up_task(rovr);
-		}
-	}
-	spin_unlock(&clnt->cl_lock);
-}
-EXPORT_SYMBOL_GPL(rpc_killall_tasks);
-
 int rpciod_up(void)
 {
 	return try_module_get(THIS_MODULE) ? 0 : -EINVAL;
@@ -978,7 +908,7 @@ static int rpciod_start(void)
 	 * Create the rpciod thread and wait for it to start.
 	 */
 	dprintk("RPC:       creating workqueue rpciod\n");
-	wq = create_workqueue("rpciod");
+	wq = alloc_workqueue("rpciod", WQ_RESCUER, 0);
 	rpciod_workqueue = wq;
 	return rpciod_workqueue != NULL;
 }
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index ea1046f3f9a..f71a73107ae 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -22,11 +22,10 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/metrics.h>
-#include <net/net_namespace.h>
 
-#define RPCDBG_FACILITY	RPCDBG_MISC
+#include "netns.h"
 
-struct proc_dir_entry	*proc_net_rpc = NULL;
+#define RPCDBG_FACILITY	RPCDBG_MISC
 
 /*
  * Get RPC client stats
@@ -218,10 +217,11 @@ EXPORT_SYMBOL_GPL(rpc_print_iostats);
 static inline struct proc_dir_entry *
 do_register(const char *name, void *data, const struct file_operations *fops)
 {
-	rpc_proc_init();
-	dprintk("RPC:       registering /proc/net/rpc/%s\n", name);
+	struct sunrpc_net *sn;
 
-	return proc_create_data(name, 0, proc_net_rpc, fops, data);
+	dprintk("RPC:       registering /proc/net/rpc/%s\n", name);
+	sn = net_generic(&init_net, sunrpc_net_id);
+	return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
 }
 
 struct proc_dir_entry *
@@ -234,7 +234,10 @@ EXPORT_SYMBOL_GPL(rpc_proc_register);
 void
 rpc_proc_unregister(const char *name)
 {
-	remove_proc_entry(name, proc_net_rpc);
+	struct sunrpc_net *sn;
+
+	sn = net_generic(&init_net, sunrpc_net_id);
+	remove_proc_entry(name, sn->proc_net_rpc);
 }
 EXPORT_SYMBOL_GPL(rpc_proc_unregister);
 
@@ -248,25 +251,29 @@ EXPORT_SYMBOL_GPL(svc_proc_register);
 void
 svc_proc_unregister(const char *name)
 {
-	remove_proc_entry(name, proc_net_rpc);
+	struct sunrpc_net *sn;
+
+	sn = net_generic(&init_net, sunrpc_net_id);
+	remove_proc_entry(name, sn->proc_net_rpc);
 }
 EXPORT_SYMBOL_GPL(svc_proc_unregister);
 
-void
-rpc_proc_init(void)
+int rpc_proc_init(struct net *net)
 {
+	struct sunrpc_net *sn;
+
 	dprintk("RPC:       registering /proc/net/rpc\n");
-	if (!proc_net_rpc)
-		proc_net_rpc = proc_mkdir("rpc", init_net.proc_net);
+	sn = net_generic(net, sunrpc_net_id);
+	sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
+	if (sn->proc_net_rpc == NULL)
+		return -ENOMEM;
+
+	return 0;
 }
 
-void
-rpc_proc_exit(void)
+void rpc_proc_exit(struct net *net)
 {
 	dprintk("RPC:       unregistering /proc/net/rpc\n");
-	if (proc_net_rpc) {
-		proc_net_rpc = NULL;
-		remove_proc_entry("rpc", init_net.proc_net);
-	}
+	remove_proc_entry("rpc", net->proc_net);
 }
 
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index f438347d817..9d080916099 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,7 +22,44 @@
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/sunrpc/xprtsock.h>
 
-extern struct cache_detail ip_map_cache, unix_gid_cache;
+#include "netns.h"
+
+int sunrpc_net_id;
+
+static __net_init int sunrpc_init_net(struct net *net)
+{
+	int err;
+
+	err = rpc_proc_init(net);
+	if (err)
+		goto err_proc;
+
+	err = ip_map_cache_create(net);
+	if (err)
+		goto err_ipmap;
+
+	return 0;
+
+err_ipmap:
+	rpc_proc_exit(net);
+err_proc:
+	return err;
+}
+
+static __net_exit void sunrpc_exit_net(struct net *net)
+{
+	ip_map_cache_destroy(net);
+	rpc_proc_exit(net);
+}
+
+static struct pernet_operations sunrpc_net_ops = {
+	.init = sunrpc_init_net,
+	.exit = sunrpc_exit_net,
+	.id = &sunrpc_net_id,
+	.size = sizeof(struct sunrpc_net),
+};
+
+extern struct cache_detail unix_gid_cache;
 
 extern void cleanup_rpcb_clnt(void);
 
@@ -33,21 +70,31 @@ init_sunrpc(void)
 	if (err)
 		goto out;
 	err = rpc_init_mempool();
-	if (err) {
-		unregister_rpc_pipefs();
-		goto out;
-	}
+	if (err)
+		goto out2;
+	err = rpcauth_init_module();
+	if (err)
+		goto out3;
+
+	cache_initialize();
+
+	err = register_pernet_subsys(&sunrpc_net_ops);
+	if (err)
+		goto out4;
 #ifdef RPC_DEBUG
 	rpc_register_sysctl();
 #endif
-#ifdef CONFIG_PROC_FS
-	rpc_proc_init();
-#endif
-	cache_register(&ip_map_cache);
 	cache_register(&unix_gid_cache);
 	svc_init_xprt_sock();	/* svc sock transport */
 	init_socket_xprt();	/* clnt sock transport */
-	rpcauth_init_module();
+	return 0;
+
+out4:
+	rpcauth_remove_module();
+out3:
+	rpc_destroy_mempool();
+out2:
+	unregister_rpc_pipefs();
 out:
 	return err;
 }
@@ -61,14 +108,11 @@ cleanup_sunrpc(void)
 	svc_cleanup_xprt_sock();
 	unregister_rpc_pipefs();
 	rpc_destroy_mempool();
-	cache_unregister(&ip_map_cache);
 	cache_unregister(&unix_gid_cache);
+	unregister_pernet_subsys(&sunrpc_net_ops);
 #ifdef RPC_DEBUG
 	rpc_unregister_sysctl();
 #endif
-#ifdef CONFIG_PROC_FS
-	rpc_proc_exit();
-#endif
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }
 MODULE_LICENSE("GPL");
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index d9017d64597..6359c42c494 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1055,6 +1055,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 		goto err_bad;
 	case SVC_DENIED:
 		goto err_bad_auth;
+	case SVC_CLOSE:
+		if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+			svc_close_xprt(rqstp->rq_xprt);
 	case SVC_DROP:
 		goto dropit;
 	case SVC_COMPLETE:
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index cbc084939dd..c82fe739fbd 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -100,16 +100,14 @@ EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
  */
 int svc_print_xprts(char *buf, int maxlen)
 {
-	struct list_head *le;
+	struct svc_xprt_class *xcl;
 	char tmpstr[80];
 	int len = 0;
 	buf[0] = '\0';
 
 	spin_lock(&svc_xprt_class_lock);
-	list_for_each(le, &svc_xprt_class_list) {
+	list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
 		int slen;
-		struct svc_xprt_class *xcl =
-			list_entry(le, struct svc_xprt_class, xcl_list);
 
 		sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
 		slen = strlen(tmpstr);
@@ -128,9 +126,9 @@ static void svc_xprt_free(struct kref *kref)
 	struct svc_xprt *xprt =
 		container_of(kref, struct svc_xprt, xpt_ref);
 	struct module *owner = xprt->xpt_class->xcl_owner;
-	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) &&
-	    xprt->xpt_auth_cache != NULL)
-		svcauth_unix_info_release(xprt->xpt_auth_cache);
+	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags))
+		svcauth_unix_info_release(xprt);
+	put_net(xprt->xpt_net);
 	xprt->xpt_ops->xpo_free(xprt);
 	module_put(owner);
 }
@@ -156,15 +154,18 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
 	INIT_LIST_HEAD(&xprt->xpt_list);
 	INIT_LIST_HEAD(&xprt->xpt_ready);
 	INIT_LIST_HEAD(&xprt->xpt_deferred);
+	INIT_LIST_HEAD(&xprt->xpt_users);
 	mutex_init(&xprt->xpt_mutex);
 	spin_lock_init(&xprt->xpt_lock);
 	set_bit(XPT_BUSY, &xprt->xpt_flags);
 	rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
+	xprt->xpt_net = get_net(&init_net);
 }
 EXPORT_SYMBOL_GPL(svc_xprt_init);
 
 static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 					 struct svc_serv *serv,
+					 struct net *net,
 					 const int family,
 					 const unsigned short port,
 					 int flags)
@@ -199,12 +200,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 		return ERR_PTR(-EAFNOSUPPORT);
 	}
 
-	return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
+	return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
 }
 
 int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
-		    const int family, const unsigned short port,
-		    int flags)
+		    struct net *net, const int family,
+		    const unsigned short port, int flags)
 {
 	struct svc_xprt_class *xcl;
 
@@ -220,7 +221,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
 			goto err;
 
 		spin_unlock(&svc_xprt_class_lock);
-		newxprt = __svc_xpo_create(xcl, serv, family, port, flags);
+		newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags);
 		if (IS_ERR(newxprt)) {
 			module_put(xcl->xcl_owner);
 			return PTR_ERR(newxprt);
@@ -329,12 +330,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 		       "svc_xprt_enqueue: "
 		       "threads and transports both waiting??\n");
 
-	if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
-		/* Don't enqueue dead transports */
-		dprintk("svc: transport %p is dead, not enqueued\n", xprt);
-		goto out_unlock;
-	}
-
 	pool->sp_stats.packets++;
 
 	/* Mark transport as busy. It will remain in this state until
@@ -651,6 +646,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 	if (signalled() || kthread_should_stop())
 		return -EINTR;
 
+	/* Normally we will wait up to 5 seconds for any required
+	 * cache information to be provided.
+	 */
+	rqstp->rq_chandle.thread_wait = 5*HZ;
+
 	spin_lock_bh(&pool->sp_lock);
 	xprt = svc_xprt_dequeue(pool);
 	if (xprt) {
@@ -658,6 +658,12 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 		svc_xprt_get(xprt);
 		rqstp->rq_reserved = serv->sv_max_mesg;
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+
+		/* As there is a shortage of threads and this request
+		 * had to be queued, don't allow the thread to wait so
+		 * long for cache updates.
+		 */
+		rqstp->rq_chandle.thread_wait = 1*HZ;
 	} else {
 		/* No data pending. Go to sleep */
 		svc_thread_enqueue(pool, rqstp);
@@ -868,6 +874,19 @@ static void svc_age_temp_xprts(unsigned long closure)
 	mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
 }
 
+static void call_xpt_users(struct svc_xprt *xprt)
+{
+	struct svc_xpt_user *u;
+
+	spin_lock(&xprt->xpt_lock);
+	while (!list_empty(&xprt->xpt_users)) {
+		u = list_first_entry(&xprt->xpt_users, struct svc_xpt_user, list);
+		list_del(&u->list);
+		u->callback(u);
+	}
+	spin_unlock(&xprt->xpt_lock);
+}
+
 /*
  * Remove a dead transport
  */
@@ -878,7 +897,7 @@ void svc_delete_xprt(struct svc_xprt *xprt)
 
 	/* Only do this once */
 	if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags))
-		return;
+		BUG();
 
 	dprintk("svc: svc_delete_xprt(%p)\n", xprt);
 	xprt->xpt_ops->xpo_detach(xprt);
@@ -900,6 +919,7 @@ void svc_delete_xprt(struct svc_xprt *xprt)
 	while ((dr = svc_deferred_dequeue(xprt)) != NULL)
 		kfree(dr);
 
+	call_xpt_users(xprt);
 	svc_xprt_put(xprt);
 }
 
@@ -910,10 +930,7 @@ void svc_close_xprt(struct svc_xprt *xprt)
 		/* someone else will have to effect the close */
 		return;
 
-	svc_xprt_get(xprt);
 	svc_delete_xprt(xprt);
-	clear_bit(XPT_BUSY, &xprt->xpt_flags);
-	svc_xprt_put(xprt);
 }
 EXPORT_SYMBOL_GPL(svc_close_xprt);
 
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 20731161098..560677d187f 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -18,6 +18,8 @@
 
 #include <linux/sunrpc/clnt.h>
 
+#include "netns.h"
+
 /*
  * AUTHUNIX and AUTHNULL credentials are both handled here.
  * AUTHNULL is treated just like AUTHUNIX except that the uid/gid
@@ -92,7 +94,6 @@ struct ip_map {
 	struct unix_domain	*m_client;
 	int			m_add_change;
 };
-static struct cache_head	*ip_table[IP_HASHMAX];
 
 static void ip_map_put(struct kref *kref)
 {
@@ -178,8 +179,8 @@ static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
 	return sunrpc_cache_pipe_upcall(cd, h, ip_map_request);
 }
 
-static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr);
-static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
+static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class, struct in6_addr *addr);
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm, struct unix_domain *udom, time_t expiry);
 
 static int ip_map_parse(struct cache_detail *cd,
 			  char *mesg, int mlen)
@@ -219,10 +220,9 @@ static int ip_map_parse(struct cache_detail *cd,
 	switch (address.sa.sa_family) {
 	case AF_INET:
 		/* Form a mapped IPv4 address in sin6 */
-		memset(&sin6, 0, sizeof(sin6));
 		sin6.sin6_family = AF_INET6;
-		sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
-		sin6.sin6_addr.s6_addr32[3] = address.s4.sin_addr.s_addr;
+		ipv6_addr_set_v4mapped(address.s4.sin_addr.s_addr,
+				&sin6.sin6_addr);
 		break;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	case AF_INET6:
@@ -249,9 +249,9 @@ static int ip_map_parse(struct cache_detail *cd,
 		dom = NULL;
 
 	/* IPv6 scope IDs are ignored for now */
-	ipmp = ip_map_lookup(class, &sin6.sin6_addr);
+	ipmp = __ip_map_lookup(cd, class, &sin6.sin6_addr);
 	if (ipmp) {
-		err = ip_map_update(ipmp,
+		err = __ip_map_update(cd, ipmp,
 			     container_of(dom, struct unix_domain, h),
 			     expiry);
 	} else
@@ -294,29 +294,15 @@ static int ip_map_show(struct seq_file *m,
 }
 
 
-struct cache_detail ip_map_cache = {
-	.owner		= THIS_MODULE,
-	.hash_size	= IP_HASHMAX,
-	.hash_table	= ip_table,
-	.name		= "auth.unix.ip",
-	.cache_put	= ip_map_put,
-	.cache_upcall	= ip_map_upcall,
-	.cache_parse	= ip_map_parse,
-	.cache_show	= ip_map_show,
-	.match		= ip_map_match,
-	.init		= ip_map_init,
-	.update		= update,
-	.alloc		= ip_map_alloc,
-};
-
-static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr)
+static struct ip_map *__ip_map_lookup(struct cache_detail *cd, char *class,
+		struct in6_addr *addr)
 {
 	struct ip_map ip;
 	struct cache_head *ch;
 
 	strcpy(ip.m_class, class);
 	ipv6_addr_copy(&ip.m_addr, addr);
-	ch = sunrpc_cache_lookup(&ip_map_cache, &ip.h,
+	ch = sunrpc_cache_lookup(cd, &ip.h,
 				 hash_str(class, IP_HASHBITS) ^
 				 hash_ip6(*addr));
 
@@ -326,7 +312,17 @@ static struct ip_map *ip_map_lookup(char *class, struct in6_addr *addr)
 		return NULL;
 }
 
-static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t expiry)
+static inline struct ip_map *ip_map_lookup(struct net *net, char *class,
+		struct in6_addr *addr)
+{
+	struct sunrpc_net *sn;
+
+	sn = net_generic(net, sunrpc_net_id);
+	return __ip_map_lookup(sn->ip_map_cache, class, addr);
+}
+
+static int __ip_map_update(struct cache_detail *cd, struct ip_map *ipm,
+		struct unix_domain *udom, time_t expiry)
 {
 	struct ip_map ip;
 	struct cache_head *ch;
@@ -344,17 +340,25 @@ static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t ex
 			ip.m_add_change++;
 	}
 	ip.h.expiry_time = expiry;
-	ch = sunrpc_cache_update(&ip_map_cache,
-				 &ip.h, &ipm->h,
+	ch = sunrpc_cache_update(cd, &ip.h, &ipm->h,
 				 hash_str(ipm->m_class, IP_HASHBITS) ^
 				 hash_ip6(ipm->m_addr));
 	if (!ch)
 		return -ENOMEM;
-	cache_put(ch, &ip_map_cache);
+	cache_put(ch, cd);
 	return 0;
 }
 
-int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom)
+static inline int ip_map_update(struct net *net, struct ip_map *ipm,
+		struct unix_domain *udom, time_t expiry)
+{
+	struct sunrpc_net *sn;
+
+	sn = net_generic(net, sunrpc_net_id);
+	return __ip_map_update(sn->ip_map_cache, ipm, udom, expiry);
+}
+
+int auth_unix_add_addr(struct net *net, struct in6_addr *addr, struct auth_domain *dom)
 {
 	struct unix_domain *udom;
 	struct ip_map *ipmp;
@@ -362,10 +366,10 @@ int auth_unix_add_addr(struct in6_addr *addr, struct auth_domain *dom)
 	if (dom->flavour != &svcauth_unix)
 		return -EINVAL;
 	udom = container_of(dom, struct unix_domain, h);
-	ipmp = ip_map_lookup("nfsd", addr);
+	ipmp = ip_map_lookup(net, "nfsd", addr);
 
 	if (ipmp)
-		return ip_map_update(ipmp, udom, NEVER);
+		return ip_map_update(net, ipmp, udom, NEVER);
 	else
 		return -ENOMEM;
 }
@@ -383,16 +387,18 @@ int auth_unix_forget_old(struct auth_domain *dom)
 }
 EXPORT_SYMBOL_GPL(auth_unix_forget_old);
 
-struct auth_domain *auth_unix_lookup(struct in6_addr *addr)
+struct auth_domain *auth_unix_lookup(struct net *net, struct in6_addr *addr)
 {
 	struct ip_map *ipm;
 	struct auth_domain *rv;
+	struct sunrpc_net *sn;
 
-	ipm = ip_map_lookup("nfsd", addr);
+	sn = net_generic(net, sunrpc_net_id);
+	ipm = ip_map_lookup(net, "nfsd", addr);
 
 	if (!ipm)
 		return NULL;
-	if (cache_check(&ip_map_cache, &ipm->h, NULL))
+	if (cache_check(sn->ip_map_cache, &ipm->h, NULL))
 		return NULL;
 
 	if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
@@ -403,22 +409,29 @@ struct auth_domain *auth_unix_lookup(struct in6_addr *addr)
 		rv = &ipm->m_client->h;
 		kref_get(&rv->ref);
 	}
-	cache_put(&ipm->h, &ip_map_cache);
+	cache_put(&ipm->h, sn->ip_map_cache);
 	return rv;
 }
 EXPORT_SYMBOL_GPL(auth_unix_lookup);
 
 void svcauth_unix_purge(void)
 {
-	cache_purge(&ip_map_cache);
+	struct net *net;
+
+	for_each_net(net) {
+		struct sunrpc_net *sn;
+
+		sn = net_generic(net, sunrpc_net_id);
+		cache_purge(sn->ip_map_cache);
+	}
 }
 EXPORT_SYMBOL_GPL(svcauth_unix_purge);
 
 static inline struct ip_map *
-ip_map_cached_get(struct svc_rqst *rqstp)
+ip_map_cached_get(struct svc_xprt *xprt)
 {
 	struct ip_map *ipm = NULL;
-	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct sunrpc_net *sn;
 
 	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
 		spin_lock(&xprt->xpt_lock);
@@ -430,9 +443,10 @@ ip_map_cached_get(struct svc_rqst *rqstp)
 				 * remembered, e.g. by a second mount from the
 				 * same IP address.
 				 */
+				sn = net_generic(xprt->xpt_net, sunrpc_net_id);
 				xprt->xpt_auth_cache = NULL;
 				spin_unlock(&xprt->xpt_lock);
-				cache_put(&ipm->h, &ip_map_cache);
+				cache_put(&ipm->h, sn->ip_map_cache);
 				return NULL;
 			}
 			cache_get(&ipm->h);
@@ -443,10 +457,8 @@ ip_map_cached_get(struct svc_rqst *rqstp)
 }
 
 static inline void
-ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
+ip_map_cached_put(struct svc_xprt *xprt, struct ip_map *ipm)
 {
-	struct svc_xprt *xprt = rqstp->rq_xprt;
-
 	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
 		spin_lock(&xprt->xpt_lock);
 		if (xprt->xpt_auth_cache == NULL) {
@@ -456,15 +468,26 @@ ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
 		}
 		spin_unlock(&xprt->xpt_lock);
 	}
-	if (ipm)
-		cache_put(&ipm->h, &ip_map_cache);
+	if (ipm) {
+		struct sunrpc_net *sn;
+
+		sn = net_generic(xprt->xpt_net, sunrpc_net_id);
+		cache_put(&ipm->h, sn->ip_map_cache);
+	}
 }
 
 void
-svcauth_unix_info_release(void *info)
+svcauth_unix_info_release(struct svc_xprt *xpt)
 {
-	struct ip_map *ipm = info;
-	cache_put(&ipm->h, &ip_map_cache);
+	struct ip_map *ipm;
+
+	ipm = xpt->xpt_auth_cache;
+	if (ipm != NULL) {
+		struct sunrpc_net *sn;
+
+		sn = net_generic(xpt->xpt_net, sunrpc_net_id);
+		cache_put(&ipm->h, sn->ip_map_cache);
+	}
 }
 
 /****************************************************************************
@@ -674,6 +697,8 @@ static struct group_info *unix_gid_find(uid_t uid, struct svc_rqst *rqstp)
 	switch (ret) {
 	case -ENOENT:
 		return ERR_PTR(-ENOENT);
+	case -ETIMEDOUT:
+		return ERR_PTR(-ESHUTDOWN);
 	case 0:
 		gi = get_group_info(ug->gi);
 		cache_put(&ug->h, &unix_gid_cache);
@@ -691,6 +716,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 	struct ip_map *ipm;
 	struct group_info *gi;
 	struct svc_cred *cred = &rqstp->rq_cred;
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct net *net = xprt->xpt_net;
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
 
 	switch (rqstp->rq_addr.ss_family) {
 	case AF_INET:
@@ -709,26 +737,27 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 	if (rqstp->rq_proc == 0)
 		return SVC_OK;
 
-	ipm = ip_map_cached_get(rqstp);
+	ipm = ip_map_cached_get(xprt);
 	if (ipm == NULL)
-		ipm = ip_map_lookup(rqstp->rq_server->sv_program->pg_class,
+		ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
 				    &sin6->sin6_addr);
 
 	if (ipm == NULL)
 		return SVC_DENIED;
 
-	switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
+	switch (cache_check(sn->ip_map_cache, &ipm->h, &rqstp->rq_chandle)) {
 		default:
 			BUG();
-		case -EAGAIN:
 		case -ETIMEDOUT:
+			return SVC_CLOSE;
+		case -EAGAIN:
 			return SVC_DROP;
 		case -ENOENT:
 			return SVC_DENIED;
 		case 0:
 			rqstp->rq_client = &ipm->m_client->h;
 			kref_get(&rqstp->rq_client->ref);
-			ip_map_cached_put(rqstp, ipm);
+			ip_map_cached_put(xprt, ipm);
 			break;
 	}
 
@@ -736,6 +765,8 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
 	switch (PTR_ERR(gi)) {
 	case -EAGAIN:
 		return SVC_DROP;
+	case -ESHUTDOWN:
+		return SVC_CLOSE;
 	case -ENOENT:
 		break;
 	default:
@@ -776,7 +807,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
 	cred->cr_gid = (gid_t) -1;
 	cred->cr_group_info = groups_alloc(0);
 	if (cred->cr_group_info == NULL)
-		return SVC_DROP; /* kmalloc failure - client must retry */
+		return SVC_CLOSE; /* kmalloc failure - client must retry */
 
 	/* Put NULL verifier */
 	svc_putnl(resv, RPC_AUTH_NULL);
@@ -840,7 +871,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
 		goto badcred;
 	cred->cr_group_info = groups_alloc(slen);
 	if (cred->cr_group_info == NULL)
-		return SVC_DROP;
+		return SVC_CLOSE;
 	for (i = 0; i < slen; i++)
 		GROUP_AT(cred->cr_group_info, i) = svc_getnl(argv);
 	if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
@@ -886,3 +917,56 @@ struct auth_ops svcauth_unix = {
 	.set_client	= svcauth_unix_set_client,
 };
 
+int ip_map_cache_create(struct net *net)
+{
+	int err = -ENOMEM;
+	struct cache_detail *cd;
+	struct cache_head **tbl;
+	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+	cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
+	if (cd == NULL)
+		goto err_cd;
+
+	tbl = kzalloc(IP_HASHMAX * sizeof(struct cache_head *), GFP_KERNEL);
+	if (tbl == NULL)
+		goto err_tbl;
+
+	cd->owner = THIS_MODULE,
+	cd->hash_size = IP_HASHMAX,
+	cd->hash_table = tbl,
+	cd->name = "auth.unix.ip",
+	cd->cache_put = ip_map_put,
+	cd->cache_upcall = ip_map_upcall,
+	cd->cache_parse = ip_map_parse,
+	cd->cache_show = ip_map_show,
+	cd->match = ip_map_match,
+	cd->init = ip_map_init,
+	cd->update = update,
+	cd->alloc = ip_map_alloc,
+
+	err = cache_register_net(cd, net);
+	if (err)
+		goto err_reg;
+
+	sn->ip_map_cache = cd;
+	return 0;
+
+err_reg:
+	kfree(tbl);
+err_tbl:
+	kfree(cd);
+err_cd:
+	return err;
+}
+
+void ip_map_cache_destroy(struct net *net)
+{
+	struct sunrpc_net *sn;
+
+	sn = net_generic(net, sunrpc_net_id);
+	cache_purge(sn->ip_map_cache);
+	cache_unregister_net(sn->ip_map_cache, net);
+	kfree(sn->ip_map_cache->hash_table);
+	kfree(sn->ip_map_cache);
+}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7e534dd0907..07919e16be3 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -64,7 +64,8 @@ static void		svc_tcp_sock_detach(struct svc_xprt *);
 static void		svc_sock_free(struct svc_xprt *);
 
 static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
-					  struct sockaddr *, int, int);
+					  struct net *, struct sockaddr *,
+					  int, int);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key svc_key[2];
 static struct lock_class_key svc_slock_key[2];
@@ -657,10 +658,11 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
 }
 
 static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
+				       struct net *net,
 				       struct sockaddr *sa, int salen,
 				       int flags)
 {
-	return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
+	return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
 }
 
 static struct svc_xprt_ops svc_udp_ops = {
@@ -1133,9 +1135,6 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
 	reclen = htonl(0x80000000|((xbufp->len ) - 4));
 	memcpy(xbufp->head[0].iov_base, &reclen, 4);
 
-	if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
-		return -ENOTCONN;
-
 	sent = svc_sendto(rqstp, &rqstp->rq_res);
 	if (sent != xbufp->len) {
 		printk(KERN_NOTICE
@@ -1178,10 +1177,11 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt)
 }
 
 static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
+				       struct net *net,
 				       struct sockaddr *sa, int salen,
 				       int flags)
 {
-	return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
+	return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags);
 }
 
 static struct svc_xprt_ops svc_tcp_ops = {
@@ -1258,19 +1258,13 @@ void svc_sock_update_bufs(struct svc_serv *serv)
 	 * The number of server threads has changed. Update
 	 * rcvbuf and sndbuf accordingly on all sockets
 	 */
-	struct list_head *le;
+	struct svc_sock *svsk;
 
 	spin_lock_bh(&serv->sv_lock);
-	list_for_each(le, &serv->sv_permsocks) {
-		struct svc_sock *svsk =
-			list_entry(le, struct svc_sock, sk_xprt.xpt_list);
+	list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list)
 		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
-	}
-	list_for_each(le, &serv->sv_tempsocks) {
-		struct svc_sock *svsk =
-			list_entry(le, struct svc_sock, sk_xprt.xpt_list);
+	list_for_each_entry(svsk, &serv->sv_tempsocks, sk_xprt.xpt_list)
 		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
-	}
 	spin_unlock_bh(&serv->sv_lock);
 }
 EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
@@ -1385,6 +1379,7 @@ EXPORT_SYMBOL_GPL(svc_addsock);
  */
 static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 					  int protocol,
+					  struct net *net,
 					  struct sockaddr *sin, int len,
 					  int flags)
 {
@@ -1421,7 +1416,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 		return ERR_PTR(-EINVAL);
 	}
 
-	error = sock_create_kern(family, type, protocol, &sock);
+	error = __sock_create(net, family, type, protocol, &sock, 1);
 	if (error < 0)
 		return ERR_PTR(error);
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index a1f82a87d34..cd9e841e749 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -111,6 +111,23 @@ xdr_decode_string_inplace(__be32 *p, char **sp,
 }
 EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
 
+/**
+ * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
+ * @buf: XDR buffer where string resides
+ * @len: length of string, in bytes
+ *
+ */
+void
+xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+{
+	char *kaddr;
+
+	kaddr = kmap_atomic(buf->pages[0], KM_USER0);
+	kaddr[buf->page_base + len] = '\0';
+	kunmap_atomic(kaddr, KM_USER0);
+}
+EXPORT_SYMBOL(xdr_terminate_string);
+
 void
 xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
 		 unsigned int len)
@@ -395,24 +412,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
 {
 	struct kvec *tail;
 	size_t copy;
-	char *p;
 	unsigned int pglen = buf->page_len;
+	unsigned int tailbuf_len;
 
 	tail = buf->tail;
 	BUG_ON (len > pglen);
 
+	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
+
 	/* Shift the tail first */
-	if (tail->iov_len != 0) {
-		p = (char *)tail->iov_base + len;
+	if (tailbuf_len != 0) {
+		unsigned int free_space = tailbuf_len - tail->iov_len;
+
+		if (len < free_space)
+			free_space = len;
+		tail->iov_len += free_space;
+
+		copy = len;
 		if (tail->iov_len > len) {
-			copy = tail->iov_len - len;
-			memmove(p, tail->iov_base, copy);
+			char *p = (char *)tail->iov_base + len;
+			memmove(p, tail->iov_base, tail->iov_len - len);
 		} else
-			buf->buflen -= len;
-		/* Copy from the inlined pages into the tail */
-		copy = len;
-		if (copy > tail->iov_len)
 			copy = tail->iov_len;
+		/* Copy from the inlined pages into the tail */
 		_copy_from_pages((char *)tail->iov_base,
 				buf->pages, buf->page_base + pglen - len,
 				copy);
@@ -551,6 +573,27 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
 EXPORT_SYMBOL_GPL(xdr_init_decode);
 
 /**
+ * xdr_inline_peek - Allow read-ahead in the XDR data stream
+ * @xdr: pointer to xdr_stream struct
+ * @nbytes: number of bytes of data to decode
+ *
+ * Check if the input buffer is long enough to enable us to decode
+ * 'nbytes' more bytes of data starting at the current position.
+ * If so return the current pointer without updating the current
+ * pointer position.
+ */
+__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr->p;
+	__be32 *q = p + XDR_QUADLEN(nbytes);
+
+	if (unlikely(q > xdr->end || q < p))
+		return NULL;
+	return p;
+}
+EXPORT_SYMBOL_GPL(xdr_inline_peek);
+
+/**
  * xdr_inline_decode - Retrieve non-page XDR data to decode
  * @xdr: pointer to xdr_stream struct
  * @nbytes: number of bytes of data to decode
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index dcd0132396b..4c8f18aff7c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -199,8 +199,6 @@ int xprt_reserve_xprt(struct rpc_task *task)
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) {
 		if (task == xprt->snd_task)
 			return 1;
-		if (task == NULL)
-			return 0;
 		goto out_sleep;
 	}
 	xprt->snd_task = task;
@@ -757,13 +755,11 @@ static void xprt_connect_status(struct rpc_task *task)
  */
 struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
 {
-	struct list_head *pos;
+	struct rpc_rqst *entry;
 
-	list_for_each(pos, &xprt->recv) {
-		struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list);
+	list_for_each_entry(entry, &xprt->recv, rq_list)
 		if (entry->rq_xid == xid)
 			return entry;
-	}
 
 	dprintk("RPC:       xprt_lookup_rqst did not find xid %08x\n",
 			ntohl(xid));
@@ -962,6 +958,37 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
 	spin_unlock(&xprt->reserve_lock);
 }
 
+struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req)
+{
+	struct rpc_xprt *xprt;
+
+	xprt = kzalloc(size, GFP_KERNEL);
+	if (xprt == NULL)
+		goto out;
+
+	xprt->max_reqs = max_req;
+	xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL);
+	if (xprt->slot == NULL)
+		goto out_free;
+
+	xprt->xprt_net = get_net(net);
+	return xprt;
+
+out_free:
+	kfree(xprt);
+out:
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(xprt_alloc);
+
+void xprt_free(struct rpc_xprt *xprt)
+{
+	put_net(xprt->xprt_net);
+	kfree(xprt->slot);
+	kfree(xprt);
+}
+EXPORT_SYMBOL_GPL(xprt_free);
+
 /**
  * xprt_reserve - allocate an RPC request slot
  * @task: RPC task requesting a slot allocation
@@ -1032,6 +1059,8 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(req->rq_buffer);
+	if (req->rq_cred != NULL)
+		put_rpccred(req->rq_cred);
 	task->tk_rqstp = NULL;
 	if (req->rq_release_snd_buf)
 		req->rq_release_snd_buf(req);
@@ -1129,6 +1158,7 @@ static void xprt_destroy(struct kref *kref)
 	rpc_destroy_wait_queue(&xprt->sending);
 	rpc_destroy_wait_queue(&xprt->resend);
 	rpc_destroy_wait_queue(&xprt->backlog);
+	cancel_work_sync(&xprt->task_cleanup);
 	/*
 	 * Tear down transport state and free the rpc_xprt
 	 */
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e5e28d1946a..2ac3f6e8adf 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -249,6 +249,8 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	req->rl_nchunks = nchunks;
 
 	BUG_ON(nchunks == 0);
+	BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+	       && (nchunks > 3));
 
 	/*
 	 * finish off header. If write, marshal discrim and nchunks.
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index d718b8fa952..09af4fab1a4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -43,6 +43,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sysctl.h>
+#include <linux/workqueue.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/svc_rdma.h>
@@ -74,6 +75,8 @@ atomic_t rdma_stat_sq_prod;
 struct kmem_cache *svc_rdma_map_cachep;
 struct kmem_cache *svc_rdma_ctxt_cachep;
 
+struct workqueue_struct *svc_rdma_wq;
+
 /*
  * This function implements reading and resetting an atomic_t stat
  * variable through read/write to a proc file. Any write to the file
@@ -231,7 +234,7 @@ static ctl_table svcrdma_root_table[] = {
 void svc_rdma_cleanup(void)
 {
 	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
-	flush_scheduled_work();
+	destroy_workqueue(svc_rdma_wq);
 	if (svcrdma_table_header) {
 		unregister_sysctl_table(svcrdma_table_header);
 		svcrdma_table_header = NULL;
@@ -249,6 +252,11 @@ int svc_rdma_init(void)
 	dprintk("\tsq_depth         : %d\n",
 		svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
 	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
+
+	svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
+	if (!svc_rdma_wq)
+		return -ENOMEM;
+
 	if (!svcrdma_table_header)
 		svcrdma_table_header =
 			register_sysctl_table(svcrdma_root_table);
@@ -283,6 +291,7 @@ int svc_rdma_init(void)
 	kmem_cache_destroy(svc_rdma_map_cachep);
  err0:
 	unregister_sysctl_table(svcrdma_table_header);
+	destroy_workqueue(svc_rdma_wq);
 	return -ENOMEM;
 }
 MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0194de81493..df67211c4ba 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -263,9 +263,9 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
 	frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
 	for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
 		frmr->page_list->page_list[page_no] =
-			ib_dma_map_single(xprt->sc_cm_id->device,
-					  page_address(rqstp->rq_arg.pages[page_no]),
-					  PAGE_SIZE, DMA_FROM_DEVICE);
+			ib_dma_map_page(xprt->sc_cm_id->device,
+					rqstp->rq_arg.pages[page_no], 0,
+					PAGE_SIZE, DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 frmr->page_list->page_list[page_no]))
 			goto fatal_err;
@@ -309,17 +309,21 @@ static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
 			     int count)
 {
 	int i;
+	unsigned long off;
 
 	ctxt->count = count;
 	ctxt->direction = DMA_FROM_DEVICE;
 	for (i = 0; i < count; i++) {
 		ctxt->sge[i].length = 0; /* in case map fails */
 		if (!frmr) {
+			BUG_ON(0 == virt_to_page(vec[i].iov_base));
+			off = (unsigned long)vec[i].iov_base & ~PAGE_MASK;
 			ctxt->sge[i].addr =
-				ib_dma_map_single(xprt->sc_cm_id->device,
-						  vec[i].iov_base,
-						  vec[i].iov_len,
-						  DMA_FROM_DEVICE);
+				ib_dma_map_page(xprt->sc_cm_id->device,
+						virt_to_page(vec[i].iov_base),
+						off,
+						vec[i].iov_len,
+						DMA_FROM_DEVICE);
 			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 						 ctxt->sge[i].addr))
 				return -EINVAL;
@@ -491,6 +495,7 @@ next_sge:
 			printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
 			       err);
 			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			svc_rdma_unmap_dma(ctxt);
 			svc_rdma_put_context(ctxt, 0);
 			goto out;
 		}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index b15e1ebb2bf..249a835b703 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -70,8 +70,8 @@
  * on extra page for the RPCRMDA header.
  */
 static int fast_reg_xdr(struct svcxprt_rdma *xprt,
-		 struct xdr_buf *xdr,
-		 struct svc_rdma_req_map *vec)
+			struct xdr_buf *xdr,
+			struct svc_rdma_req_map *vec)
 {
 	int sge_no;
 	u32 sge_bytes;
@@ -96,21 +96,25 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
 	vec->count = 2;
 	sge_no++;
 
-	/* Build the FRMR */
+	/* Map the XDR head */
 	frmr->kva = frva;
 	frmr->direction = DMA_TO_DEVICE;
 	frmr->access_flags = 0;
 	frmr->map_len = PAGE_SIZE;
 	frmr->page_list_len = 1;
+	page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
 	frmr->page_list->page_list[page_no] =
-		ib_dma_map_single(xprt->sc_cm_id->device,
-				  (void *)xdr->head[0].iov_base,
-				  PAGE_SIZE, DMA_TO_DEVICE);
+		ib_dma_map_page(xprt->sc_cm_id->device,
+				virt_to_page(xdr->head[0].iov_base),
+				page_off,
+				PAGE_SIZE - page_off,
+				DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 				 frmr->page_list->page_list[page_no]))
 		goto fatal_err;
 	atomic_inc(&xprt->sc_dma_used);
 
+	/* Map the XDR page list */
 	page_off = xdr->page_base;
 	page_bytes = xdr->page_len + page_off;
 	if (!page_bytes)
@@ -128,9 +132,9 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
 		page_bytes -= sge_bytes;
 
 		frmr->page_list->page_list[page_no] =
-			ib_dma_map_single(xprt->sc_cm_id->device,
-					  page_address(page),
-					  PAGE_SIZE, DMA_TO_DEVICE);
+			ib_dma_map_page(xprt->sc_cm_id->device,
+					page, page_off,
+					sge_bytes, DMA_TO_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 frmr->page_list->page_list[page_no]))
 			goto fatal_err;
@@ -166,8 +170,10 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
 		vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
 
 		frmr->page_list->page_list[page_no] =
-			ib_dma_map_single(xprt->sc_cm_id->device, va, PAGE_SIZE,
-					  DMA_TO_DEVICE);
+		    ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va),
+				    page_off,
+				    PAGE_SIZE,
+				    DMA_TO_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 frmr->page_list->page_list[page_no]))
 			goto fatal_err;
@@ -245,6 +251,35 @@ static int map_xdr(struct svcxprt_rdma *xprt,
 	return 0;
 }
 
+static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+			      struct xdr_buf *xdr,
+			      u32 xdr_off, size_t len, int dir)
+{
+	struct page *page;
+	dma_addr_t dma_addr;
+	if (xdr_off < xdr->head[0].iov_len) {
+		/* This offset is in the head */
+		xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+		page = virt_to_page(xdr->head[0].iov_base);
+	} else {
+		xdr_off -= xdr->head[0].iov_len;
+		if (xdr_off < xdr->page_len) {
+			/* This offset is in the page list */
+			page = xdr->pages[xdr_off >> PAGE_SHIFT];
+			xdr_off &= ~PAGE_MASK;
+		} else {
+			/* This offset is in the tail */
+			xdr_off -= xdr->page_len;
+			xdr_off += (unsigned long)
+				xdr->tail[0].iov_base & ~PAGE_MASK;
+			page = virt_to_page(xdr->tail[0].iov_base);
+		}
+	}
+	dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
+				   min_t(size_t, PAGE_SIZE, len), dir);
+	return dma_addr;
+}
+
 /* Assumptions:
  * - We are using FRMR
  *     - or -
@@ -293,10 +328,9 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 		sge[sge_no].length = sge_bytes;
 		if (!vec->frmr) {
 			sge[sge_no].addr =
-				ib_dma_map_single(xprt->sc_cm_id->device,
-						  (void *)
-						  vec->sge[xdr_sge_no].iov_base + sge_off,
-						  sge_bytes, DMA_TO_DEVICE);
+				dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+					    sge_bytes, DMA_TO_DEVICE);
+			xdr_off += sge_bytes;
 			if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 						 sge[sge_no].addr))
 				goto err;
@@ -333,6 +367,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 		goto err;
 	return 0;
  err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_frmr(xprt, vec->frmr);
 	svc_rdma_put_context(ctxt, 0);
 	/* Fatal error, close transport */
 	return -EIO;
@@ -494,7 +530,8 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
  * In all three cases, this function prepares the RPCRDMA header in
  * sge[0], the 'type' parameter indicates the type to place in the
  * RPCRDMA header, and the 'byte_count' field indicates how much of
- * the XDR to include in this RDMA_SEND.
+ * the XDR to include in this RDMA_SEND. NB: The offset of the payload
+ * to send is zero in the XDR.
  */
 static int send_reply(struct svcxprt_rdma *rdma,
 		      struct svc_rqst *rqstp,
@@ -536,23 +573,24 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
 	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
 	ctxt->sge[0].addr =
-		ib_dma_map_single(rdma->sc_cm_id->device, page_address(page),
-				  ctxt->sge[0].length, DMA_TO_DEVICE);
+	    ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
+			    ctxt->sge[0].length, DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
 		goto err;
 	atomic_inc(&rdma->sc_dma_used);
 
 	ctxt->direction = DMA_TO_DEVICE;
 
-	/* Determine how many of our SGE are to be transmitted */
+	/* Map the payload indicated by 'byte_count' */
 	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+		int xdr_off = 0;
 		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
 		byte_count -= sge_bytes;
 		if (!vec->frmr) {
 			ctxt->sge[sge_no].addr =
-				ib_dma_map_single(rdma->sc_cm_id->device,
-						  vec->sge[sge_no].iov_base,
-						  sge_bytes, DMA_TO_DEVICE);
+				dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+					    sge_bytes, DMA_TO_DEVICE);
+			xdr_off += sge_bytes;
 			if (ib_dma_mapping_error(rdma->sc_cm_id->device,
 						 ctxt->sge[sge_no].addr))
 				goto err;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index edea15a54e5..9df1eadc912 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -45,6 +45,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/workqueue.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
 #include <linux/sunrpc/svc_rdma.h>
@@ -52,6 +53,7 @@
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
 static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
 					struct sockaddr *sa, int salen,
 					int flags);
 static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
@@ -89,6 +91,9 @@ struct svc_xprt_class svc_rdma_class = {
 /* WR context cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_ctxt_cachep;
 
+/* Workqueue created in svc_rdma.c */
+extern struct workqueue_struct *svc_rdma_wq;
+
 struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
 	struct svc_rdma_op_ctxt *ctxt;
@@ -120,7 +125,7 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 		 */
 		if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
 			atomic_dec(&xprt->sc_dma_used);
-			ib_dma_unmap_single(xprt->sc_cm_id->device,
+			ib_dma_unmap_page(xprt->sc_cm_id->device,
 					    ctxt->sge[i].addr,
 					    ctxt->sge[i].length,
 					    ctxt->direction);
@@ -502,8 +507,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 		BUG_ON(sge_no >= xprt->sc_max_sge);
 		page = svc_rdma_get_page();
 		ctxt->pages[sge_no] = page;
-		pa = ib_dma_map_single(xprt->sc_cm_id->device,
-				     page_address(page), PAGE_SIZE,
+		pa = ib_dma_map_page(xprt->sc_cm_id->device,
+				     page, 0, PAGE_SIZE,
 				     DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
 			goto err_put_ctxt;
@@ -511,9 +516,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 		ctxt->sge[sge_no].addr = pa;
 		ctxt->sge[sge_no].length = PAGE_SIZE;
 		ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+		ctxt->count = sge_no + 1;
 		buflen += PAGE_SIZE;
 	}
-	ctxt->count = sge_no;
 	recv_wr.next = NULL;
 	recv_wr.sg_list = &ctxt->sge[0];
 	recv_wr.num_sge = ctxt->count;
@@ -529,6 +534,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 	return ret;
 
  err_put_ctxt:
+	svc_rdma_unmap_dma(ctxt);
 	svc_rdma_put_context(ctxt, 1);
 	return -ENOMEM;
 }
@@ -670,6 +676,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id,
  * Create a listening RDMA service endpoint.
  */
 static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
 					struct sockaddr *sa, int salen,
 					int flags)
 {
@@ -798,8 +805,8 @@ static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
 		if (ib_dma_mapping_error(frmr->mr->device, addr))
 			continue;
 		atomic_dec(&xprt->sc_dma_used);
-		ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE,
-				    frmr->direction);
+		ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
+				  frmr->direction);
 	}
 }
 
@@ -1184,7 +1191,7 @@ static void svc_rdma_free(struct svc_xprt *xprt)
 	struct svcxprt_rdma *rdma =
 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
 	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
-	schedule_work(&rdma->sc_work);
+	queue_work(svc_rdma_wq, &rdma->sc_work);
 }
 
 static int svc_rdma_has_wspace(struct svc_xprt *xprt)
@@ -1274,7 +1281,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
 				   atomic_read(&xprt->sc_sq_count) <
 				   xprt->sc_sq_depth);
 			if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
-				return 0;
+				return -ENOTCONN;
 			continue;
 		}
 		/* Take a transport ref for each WR posted */
@@ -1306,7 +1313,6 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 			 enum rpcrdma_errcode err)
 {
 	struct ib_send_wr err_wr;
-	struct ib_sge sge;
 	struct page *p;
 	struct svc_rdma_op_ctxt *ctxt;
 	u32 *va;
@@ -1319,26 +1325,27 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	/* XDR encode error */
 	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
 
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->direction = DMA_FROM_DEVICE;
+	ctxt->count = 1;
+	ctxt->pages[0] = p;
+
 	/* Prepare SGE for local address */
-	sge.addr = ib_dma_map_single(xprt->sc_cm_id->device,
-				   page_address(p), PAGE_SIZE, DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
+	ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
+					    p, 0, length, DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
 		put_page(p);
 		return;
 	}
 	atomic_inc(&xprt->sc_dma_used);
-	sge.lkey = xprt->sc_dma_lkey;
-	sge.length = length;
-
-	ctxt = svc_rdma_get_context(xprt);
-	ctxt->count = 1;
-	ctxt->pages[0] = p;
+	ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+	ctxt->sge[0].length = length;
 
 	/* Prepare SEND WR */
 	memset(&err_wr, 0, sizeof err_wr);
 	ctxt->wr_op = IB_WR_SEND;
 	err_wr.wr_id = (unsigned long)ctxt;
-	err_wr.sg_list = &sge;
+	err_wr.sg_list = ctxt->sge;
 	err_wr.num_sge = 1;
 	err_wr.opcode = IB_WR_SEND;
 	err_wr.send_flags = IB_SEND_SIGNALED;
@@ -1348,9 +1355,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	if (ret) {
 		dprintk("svcrdma: Error %d posting send for protocol error\n",
 			ret);
-		ib_dma_unmap_single(xprt->sc_cm_id->device,
-				  sge.addr, PAGE_SIZE,
-				  DMA_FROM_DEVICE);
+		svc_rdma_unmap_dma(ctxt);
 		svc_rdma_put_context(ctxt, 1);
 	}
 }
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index a85e866a77f..0867070bb5c 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -237,8 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 
 	dprintk("RPC:       %s: called\n", __func__);
 
-	cancel_delayed_work(&r_xprt->rdma_connect);
-	flush_scheduled_work();
+	cancel_delayed_work_sync(&r_xprt->rdma_connect);
 
 	xprt_clear_connected(xprt);
 
@@ -251,9 +250,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
 
 	xprt_rdma_free_addresses(xprt);
 
-	kfree(xprt->slot);
-	xprt->slot = NULL;
-	kfree(xprt);
+	xprt_free(xprt);
 
 	dprintk("RPC:       %s: returning\n", __func__);
 
@@ -285,23 +282,14 @@ xprt_setup_rdma(struct xprt_create *args)
 		return ERR_PTR(-EBADF);
 	}
 
-	xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL);
+	xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
+			xprt_rdma_slot_table_entries);
 	if (xprt == NULL) {
 		dprintk("RPC:       %s: couldn't allocate rpcrdma_xprt\n",
 			__func__);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	xprt->max_reqs = xprt_rdma_slot_table_entries;
-	xprt->slot = kcalloc(xprt->max_reqs,
-				sizeof(struct rpc_rqst), GFP_KERNEL);
-	if (xprt->slot == NULL) {
-		dprintk("RPC:       %s: couldn't allocate %d slots\n",
-			__func__, xprt->max_reqs);
-		kfree(xprt);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	/* 60 second timeout, no retries */
 	xprt->timeout = &xprt_rdma_default_timeout;
 	xprt->bind_timeout = (60U * HZ);
@@ -410,8 +398,7 @@ out3:
 out2:
 	rpcrdma_ia_close(&new_xprt->rx_ia);
 out1:
-	kfree(xprt->slot);
-	kfree(xprt);
+	xprt_free(xprt);
 	return ERR_PTR(rc);
 }
 
@@ -460,7 +447,7 @@ xprt_rdma_connect(struct rpc_task *task)
 	} else {
 		schedule_delayed_work(&r_xprt->rdma_connect, 0);
 		if (!RPC_IS_ASYNC(task))
-			flush_scheduled_work();
+			flush_delayed_work(&r_xprt->rdma_connect);
 	}
 }
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 27015c6d8eb..5f4c7b3bc71 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -650,10 +650,22 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
 	switch (ia->ri_memreg_strategy) {
 	case RPCRDMA_FRMR:
-		/* Add room for frmr register and invalidate WRs */
-		ep->rep_attr.cap.max_send_wr *= 3;
-		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
-			return -EINVAL;
+		/* Add room for frmr register and invalidate WRs.
+		 * 1. FRMR reg WR for head
+		 * 2. FRMR invalidate WR for head
+		 * 3. FRMR reg WR for pagelist
+		 * 4. FRMR invalidate WR for pagelist
+		 * 5. FRMR reg WR for tail
+		 * 6. FRMR invalidate WR for tail
+		 * 7. The RDMA_SEND WR
+		 */
+		ep->rep_attr.cap.max_send_wr *= 7;
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+			cdata->max_requests = devattr.max_qp_wr / 7;
+			if (!cdata->max_requests)
+				return -EINVAL;
+			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
+		}
 		break;
 	case RPCRDMA_MEMWINDOWS_ASYNC:
 	case RPCRDMA_MEMWINDOWS:
@@ -1490,7 +1502,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 	memset(&frmr_wr, 0, sizeof frmr_wr);
 	frmr_wr.opcode = IB_WR_FAST_REG_MR;
 	frmr_wr.send_flags = 0;			/* unsignaled */
-	frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
+	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
 	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
 	frmr_wr.wr.fast_reg.page_list_len = i;
 	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7ca65c7005e..dfcab5ac65a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -774,8 +774,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
 
 	xs_close(xprt);
 	xs_free_peer_addresses(xprt);
-	kfree(xprt->slot);
-	kfree(xprt);
+	xprt_free(xprt);
 	module_put(THIS_MODULE);
 }
 
@@ -800,7 +799,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
 	u32 _xid;
 	__be32 *xp;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	dprintk("RPC:       xs_udp_data_ready...\n");
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
@@ -852,7 +851,7 @@ static void xs_udp_data_ready(struct sock *sk, int len)
  dropit:
 	skb_free_datagram(sk, skb);
  out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
@@ -1229,7 +1228,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
 
 	dprintk("RPC:       xs_tcp_data_ready...\n");
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 	if (xprt->shutdown)
@@ -1248,7 +1247,7 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
 		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
 	} while (read > 0);
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 /*
@@ -1301,18 +1300,19 @@ static void xs_tcp_state_change(struct sock *sk)
 {
 	struct rpc_xprt *xprt;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 	dprintk("RPC:       xs_tcp_state_change client %p...\n", xprt);
-	dprintk("RPC:       state %x conn %d dead %d zapped %d\n",
+	dprintk("RPC:       state %x conn %d dead %d zapped %d sk_shutdown %d\n",
 			sk->sk_state, xprt_connected(xprt),
 			sock_flag(sk, SOCK_DEAD),
-			sock_flag(sk, SOCK_ZAPPED));
+			sock_flag(sk, SOCK_ZAPPED),
+			sk->sk_shutdown);
 
 	switch (sk->sk_state) {
 	case TCP_ESTABLISHED:
-		spin_lock_bh(&xprt->transport_lock);
+		spin_lock(&xprt->transport_lock);
 		if (!xprt_test_and_set_connected(xprt)) {
 			struct sock_xprt *transport = container_of(xprt,
 					struct sock_xprt, xprt);
@@ -1326,7 +1326,7 @@ static void xs_tcp_state_change(struct sock *sk)
 
 			xprt_wake_pending_tasks(xprt, -EAGAIN);
 		}
-		spin_unlock_bh(&xprt->transport_lock);
+		spin_unlock(&xprt->transport_lock);
 		break;
 	case TCP_FIN_WAIT1:
 		/* The client initiated a shutdown of the socket */
@@ -1364,7 +1364,7 @@ static void xs_tcp_state_change(struct sock *sk)
 		xs_sock_mark_closed(xprt);
 	}
  out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 /**
@@ -1375,7 +1375,7 @@ static void xs_error_report(struct sock *sk)
 {
 	struct rpc_xprt *xprt;
 
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
 	dprintk("RPC:       %s client %p...\n"
@@ -1383,7 +1383,7 @@ static void xs_error_report(struct sock *sk)
 			__func__, xprt, sk->sk_err);
 	xprt_wake_pending_tasks(xprt, -EAGAIN);
 out:
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void xs_write_space(struct sock *sk)
@@ -1415,13 +1415,13 @@ static void xs_write_space(struct sock *sk)
  */
 static void xs_udp_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 
 	/* from net/core/sock.c:sock_def_write_space */
 	if (sock_writeable(sk))
 		xs_write_space(sk);
 
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 /**
@@ -1436,13 +1436,13 @@ static void xs_udp_write_space(struct sock *sk)
  */
 static void xs_tcp_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	read_lock_bh(&sk->sk_callback_lock);
 
 	/* from net/core/stream.c:sk_stream_write_space */
 	if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 		xs_write_space(sk);
 
-	read_unlock(&sk->sk_callback_lock);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt)
@@ -1515,7 +1515,7 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 	xs_update_peer_port(xprt);
 }
 
-static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket *sock)
+static unsigned short xs_get_srcport(struct sock_xprt *transport)
 {
 	unsigned short port = transport->srcport;
 
@@ -1524,7 +1524,7 @@ static unsigned short xs_get_srcport(struct sock_xprt *transport, struct socket
 	return port;
 }
 
-static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket *sock, unsigned short port)
+static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
 {
 	if (transport->srcport != 0)
 		transport->srcport = 0;
@@ -1534,23 +1534,18 @@ static unsigned short xs_next_srcport(struct sock_xprt *transport, struct socket
 		return xprt_max_resvport;
 	return --port;
 }
-
-static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
+static int xs_bind(struct sock_xprt *transport, struct socket *sock)
 {
-	struct sockaddr_in myaddr = {
-		.sin_family = AF_INET,
-	};
-	struct sockaddr_in *sa;
+	struct sockaddr_storage myaddr;
 	int err, nloop = 0;
-	unsigned short port = xs_get_srcport(transport, sock);
+	unsigned short port = xs_get_srcport(transport);
 	unsigned short last;
 
-	sa = (struct sockaddr_in *)&transport->srcaddr;
-	myaddr.sin_addr = sa->sin_addr;
+	memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen);
 	do {
-		myaddr.sin_port = htons(port);
-		err = kernel_bind(sock, (struct sockaddr *) &myaddr,
-						sizeof(myaddr));
+		rpc_set_port((struct sockaddr *)&myaddr, port);
+		err = kernel_bind(sock, (struct sockaddr *)&myaddr,
+				transport->xprt.addrlen);
 		if (port == 0)
 			break;
 		if (err == 0) {
@@ -1558,48 +1553,23 @@ static int xs_bind4(struct sock_xprt *transport, struct socket *sock)
 			break;
 		}
 		last = port;
-		port = xs_next_srcport(transport, sock, port);
+		port = xs_next_srcport(transport, port);
 		if (port > last)
 			nloop++;
 	} while (err == -EADDRINUSE && nloop != 2);
-	dprintk("RPC:       %s %pI4:%u: %s (%d)\n",
-			__func__, &myaddr.sin_addr,
-			port, err ? "failed" : "ok", err);
-	return err;
-}
 
-static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
-{
-	struct sockaddr_in6 myaddr = {
-		.sin6_family = AF_INET6,
-	};
-	struct sockaddr_in6 *sa;
-	int err, nloop = 0;
-	unsigned short port = xs_get_srcport(transport, sock);
-	unsigned short last;
-
-	sa = (struct sockaddr_in6 *)&transport->srcaddr;
-	myaddr.sin6_addr = sa->sin6_addr;
-	do {
-		myaddr.sin6_port = htons(port);
-		err = kernel_bind(sock, (struct sockaddr *) &myaddr,
-						sizeof(myaddr));
-		if (port == 0)
-			break;
-		if (err == 0) {
-			transport->srcport = port;
-			break;
-		}
-		last = port;
-		port = xs_next_srcport(transport, sock, port);
-		if (port > last)
-			nloop++;
-	} while (err == -EADDRINUSE && nloop != 2);
-	dprintk("RPC:       xs_bind6 %pI6:%u: %s (%d)\n",
-		&myaddr.sin6_addr, port, err ? "failed" : "ok", err);
+	if (myaddr.ss_family == AF_INET)
+		dprintk("RPC:       %s %pI4:%u: %s (%d)\n", __func__,
+				&((struct sockaddr_in *)&myaddr)->sin_addr,
+				port, err ? "failed" : "ok", err);
+	else
+		dprintk("RPC:       %s %pI6:%u: %s (%d)\n", __func__,
+				&((struct sockaddr_in6 *)&myaddr)->sin6_addr,
+				port, err ? "failed" : "ok", err);
 	return err;
 }
 
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key xs_key[2];
 static struct lock_class_key xs_slock_key[2];
@@ -1621,6 +1591,18 @@ static inline void xs_reclassify_socket6(struct socket *sock)
 	sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC",
 		&xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]);
 }
+
+static inline void xs_reclassify_socket(int family, struct socket *sock)
+{
+	switch (family) {
+	case AF_INET:
+		xs_reclassify_socket4(sock);
+		break;
+	case AF_INET6:
+		xs_reclassify_socket6(sock);
+		break;
+	}
+}
 #else
 static inline void xs_reclassify_socket4(struct socket *sock)
 {
@@ -1629,8 +1611,36 @@ static inline void xs_reclassify_socket4(struct socket *sock)
 static inline void xs_reclassify_socket6(struct socket *sock)
 {
 }
+
+static inline void xs_reclassify_socket(int family, struct socket *sock)
+{
+}
 #endif
 
+static struct socket *xs_create_sock(struct rpc_xprt *xprt,
+		struct sock_xprt *transport, int family, int type, int protocol)
+{
+	struct socket *sock;
+	int err;
+
+	err = __sock_create(xprt->xprt_net, family, type, protocol, &sock, 1);
+	if (err < 0) {
+		dprintk("RPC:       can't create %d transport socket (%d).\n",
+				protocol, -err);
+		goto out;
+	}
+	xs_reclassify_socket(family, sock);
+
+	if (xs_bind(transport, sock)) {
+		sock_release(sock);
+		goto out;
+	}
+
+	return sock;
+out:
+	return ERR_PTR(err);
+}
+
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1660,82 +1670,23 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 	xs_udp_do_set_buffer_size(xprt);
 }
 
-/**
- * xs_udp_connect_worker4 - set up a UDP socket
- * @work: RPC transport to connect
- *
- * Invoked by a work queue tasklet.
- */
-static void xs_udp_connect_worker4(struct work_struct *work)
+static void xs_udp_setup_socket(struct work_struct *work)
 {
 	struct sock_xprt *transport =
 		container_of(work, struct sock_xprt, connect_worker.work);
 	struct rpc_xprt *xprt = &transport->xprt;
 	struct socket *sock = transport->sock;
-	int err, status = -EIO;
-
-	if (xprt->shutdown)
-		goto out;
-
-	/* Start by resetting any existing state */
-	xs_reset_transport(transport);
-
-	err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
-	if (err < 0) {
-		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
-		goto out;
-	}
-	xs_reclassify_socket4(sock);
-
-	if (xs_bind4(transport, sock)) {
-		sock_release(sock);
-		goto out;
-	}
-
-	dprintk("RPC:       worker connecting xprt %p via %s to "
-				"%s (port %s)\n", xprt,
-			xprt->address_strings[RPC_DISPLAY_PROTO],
-			xprt->address_strings[RPC_DISPLAY_ADDR],
-			xprt->address_strings[RPC_DISPLAY_PORT]);
-
-	xs_udp_finish_connecting(xprt, sock);
-	status = 0;
-out:
-	xprt_clear_connecting(xprt);
-	xprt_wake_pending_tasks(xprt, status);
-}
-
-/**
- * xs_udp_connect_worker6 - set up a UDP socket
- * @work: RPC transport to connect
- *
- * Invoked by a work queue tasklet.
- */
-static void xs_udp_connect_worker6(struct work_struct *work)
-{
-	struct sock_xprt *transport =
-		container_of(work, struct sock_xprt, connect_worker.work);
-	struct rpc_xprt *xprt = &transport->xprt;
-	struct socket *sock = transport->sock;
-	int err, status = -EIO;
+	int status = -EIO;
 
 	if (xprt->shutdown)
 		goto out;
 
 	/* Start by resetting any existing state */
 	xs_reset_transport(transport);
-
-	err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock);
-	if (err < 0) {
-		dprintk("RPC:       can't create UDP transport socket (%d).\n", -err);
-		goto out;
-	}
-	xs_reclassify_socket6(sock);
-
-	if (xs_bind6(transport, sock) < 0) {
-		sock_release(sock);
+	sock = xs_create_sock(xprt, transport,
+			xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP);
+	if (IS_ERR(sock))
 		goto out;
-	}
 
 	dprintk("RPC:       worker connecting xprt %p via %s to "
 				"%s (port %s)\n", xprt,
@@ -1754,12 +1705,12 @@ out:
  * We need to preserve the port number so the reply cache on the server can
  * find our cached RPC replies when we get around to reconnecting.
  */
-static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
+static void xs_abort_connection(struct sock_xprt *transport)
 {
 	int result;
 	struct sockaddr any;
 
-	dprintk("RPC:       disconnecting xprt %p to reuse port\n", xprt);
+	dprintk("RPC:       disconnecting xprt %p to reuse port\n", transport);
 
 	/*
 	 * Disconnect the transport socket by doing a connect operation
@@ -1769,21 +1720,36 @@ static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transpo
 	any.sa_family = AF_UNSPEC;
 	result = kernel_connect(transport->sock, &any, sizeof(any), 0);
 	if (!result)
-		xs_sock_mark_closed(xprt);
+		xs_sock_mark_closed(&transport->xprt);
 	else
 		dprintk("RPC:       AF_UNSPEC connect return code %d\n",
 				result);
 }
 
-static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
+static void xs_tcp_reuse_connection(struct sock_xprt *transport)
 {
 	unsigned int state = transport->inet->sk_state;
 
-	if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED)
-		return;
-	if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT))
-		return;
-	xs_abort_connection(xprt, transport);
+	if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) {
+		/* we don't need to abort the connection if the socket
+		 * hasn't undergone a shutdown
+		 */
+		if (transport->inet->sk_shutdown == 0)
+			return;
+		dprintk("RPC:       %s: TCP_CLOSEd and sk_shutdown set to %d\n",
+				__func__, transport->inet->sk_shutdown);
+	}
+	if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) {
+		/* we don't need to abort the connection if the socket
+		 * hasn't undergone a shutdown
+		 */
+		if (transport->inet->sk_shutdown == 0)
+			return;
+		dprintk("RPC:       %s: ESTABLISHED/SYN_SENT "
+				"sk_shutdown set to %d\n",
+				__func__, transport->inet->sk_shutdown);
+	}
+	xs_abort_connection(transport);
 }
 
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -1836,12 +1802,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
  *
  * Invoked by a work queue tasklet.
  */
-static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
-		struct sock_xprt *transport,
-		struct socket *(*create_sock)(struct rpc_xprt *,
-			struct sock_xprt *))
+static void xs_tcp_setup_socket(struct work_struct *work)
 {
+	struct sock_xprt *transport =
+		container_of(work, struct sock_xprt, connect_worker.work);
 	struct socket *sock = transport->sock;
+	struct rpc_xprt *xprt = &transport->xprt;
 	int status = -EIO;
 
 	if (xprt->shutdown)
@@ -1849,7 +1815,8 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
 
 	if (!sock) {
 		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
-		sock = create_sock(xprt, transport);
+		sock = xs_create_sock(xprt, transport,
+				xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP);
 		if (IS_ERR(sock)) {
 			status = PTR_ERR(sock);
 			goto out;
@@ -1860,7 +1827,7 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
 		abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
 				&xprt->state);
 		/* "close" the socket, preserving the local port */
-		xs_tcp_reuse_connection(xprt, transport);
+		xs_tcp_reuse_connection(transport);
 
 		if (abort_and_exit)
 			goto out_eagain;
@@ -1909,84 +1876,6 @@ out:
 	xprt_wake_pending_tasks(xprt, status);
 }
 
-static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
-		struct sock_xprt *transport)
-{
-	struct socket *sock;
-	int err;
-
-	/* start from scratch */
-	err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (err < 0) {
-		dprintk("RPC:       can't create TCP transport socket (%d).\n",
-				-err);
-		goto out_err;
-	}
-	xs_reclassify_socket4(sock);
-
-	if (xs_bind4(transport, sock) < 0) {
-		sock_release(sock);
-		goto out_err;
-	}
-	return sock;
-out_err:
-	return ERR_PTR(-EIO);
-}
-
-/**
- * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
- * @work: RPC transport to connect
- *
- * Invoked by a work queue tasklet.
- */
-static void xs_tcp_connect_worker4(struct work_struct *work)
-{
-	struct sock_xprt *transport =
-		container_of(work, struct sock_xprt, connect_worker.work);
-	struct rpc_xprt *xprt = &transport->xprt;
-
-	xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4);
-}
-
-static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt,
-		struct sock_xprt *transport)
-{
-	struct socket *sock;
-	int err;
-
-	/* start from scratch */
-	err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (err < 0) {
-		dprintk("RPC:       can't create TCP transport socket (%d).\n",
-				-err);
-		goto out_err;
-	}
-	xs_reclassify_socket6(sock);
-
-	if (xs_bind6(transport, sock) < 0) {
-		sock_release(sock);
-		goto out_err;
-	}
-	return sock;
-out_err:
-	return ERR_PTR(-EIO);
-}
-
-/**
- * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
- * @work: RPC transport to connect
- *
- * Invoked by a work queue tasklet.
- */
-static void xs_tcp_connect_worker6(struct work_struct *work)
-{
-	struct sock_xprt *transport =
-		container_of(work, struct sock_xprt, connect_worker.work);
-	struct rpc_xprt *xprt = &transport->xprt;
-
-	xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6);
-}
-
 /**
  * xs_connect - connect a socket to a remote endpoint
  * @task: address of RPC task that manages state of connect request
@@ -2246,6 +2135,31 @@ static struct rpc_xprt_ops bc_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 };
 
+static int xs_init_anyaddr(const int family, struct sockaddr *sap)
+{
+	static const struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_ANY),
+	};
+	static const struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+	};
+
+	switch (family) {
+	case AF_INET:
+		memcpy(sap, &sin, sizeof(sin));
+		break;
+	case AF_INET6:
+		memcpy(sap, &sin6, sizeof(sin6));
+		break;
+	default:
+		dprintk("RPC:       %s: Bad address family\n", __func__);
+		return -EAFNOSUPPORT;
+	}
+	return 0;
+}
+
 static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
 				      unsigned int slot_table_size)
 {
@@ -2257,27 +2171,25 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
 		return ERR_PTR(-EBADF);
 	}
 
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
-	if (new == NULL) {
+	xprt = xprt_alloc(args->net, sizeof(*new), slot_table_size);
+	if (xprt == NULL) {
 		dprintk("RPC:       xs_setup_xprt: couldn't allocate "
 				"rpc_xprt\n");
 		return ERR_PTR(-ENOMEM);
 	}
-	xprt = &new->xprt;
-
-	xprt->max_reqs = slot_table_size;
-	xprt->slot = kcalloc(xprt->max_reqs, sizeof(struct rpc_rqst), GFP_KERNEL);
-	if (xprt->slot == NULL) {
-		kfree(xprt);
-		dprintk("RPC:       xs_setup_xprt: couldn't allocate slot "
-				"table\n");
-		return ERR_PTR(-ENOMEM);
-	}
 
+	new = container_of(xprt, struct sock_xprt, xprt);
 	memcpy(&xprt->addr, args->dstaddr, args->addrlen);
 	xprt->addrlen = args->addrlen;
 	if (args->srcaddr)
 		memcpy(&new->srcaddr, args->srcaddr, args->addrlen);
+	else {
+		int err;
+		err = xs_init_anyaddr(args->dstaddr->sa_family,
+					(struct sockaddr *)&new->srcaddr);
+		if (err != 0)
+			return ERR_PTR(err);
+	}
 
 	return xprt;
 }
@@ -2325,7 +2237,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 			xprt_set_bound(xprt);
 
 		INIT_DELAYED_WORK(&transport->connect_worker,
-					xs_udp_connect_worker4);
+					xs_udp_setup_socket);
 		xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
 		break;
 	case AF_INET6:
@@ -2333,7 +2245,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 			xprt_set_bound(xprt);
 
 		INIT_DELAYED_WORK(&transport->connect_worker,
-					xs_udp_connect_worker6);
+					xs_udp_setup_socket);
 		xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
 		break;
 	default:
@@ -2355,8 +2267,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
 		return xprt;
 	ret = ERR_PTR(-EINVAL);
 out_err:
-	kfree(xprt->slot);
-	kfree(xprt);
+	xprt_free(xprt);
 	return ret;
 }
 
@@ -2400,7 +2311,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 			xprt_set_bound(xprt);
 
 		INIT_DELAYED_WORK(&transport->connect_worker,
-					xs_tcp_connect_worker4);
+					xs_tcp_setup_socket);
 		xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
 		break;
 	case AF_INET6:
@@ -2408,7 +2319,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 			xprt_set_bound(xprt);
 
 		INIT_DELAYED_WORK(&transport->connect_worker,
-					xs_tcp_connect_worker6);
+					xs_tcp_setup_socket);
 		xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
 		break;
 	default:
@@ -2431,8 +2342,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 		return xprt;
 	ret = ERR_PTR(-EINVAL);
 out_err:
-	kfree(xprt->slot);
-	kfree(xprt);
+	xprt_free(xprt);
 	return ret;
 }
 
@@ -2491,15 +2401,10 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
 		goto out_err;
 	}
 
-	if (xprt_bound(xprt))
-		dprintk("RPC:       set up xprt to %s (port %s) via %s\n",
-				xprt->address_strings[RPC_DISPLAY_ADDR],
-				xprt->address_strings[RPC_DISPLAY_PORT],
-				xprt->address_strings[RPC_DISPLAY_PROTO]);
-	else
-		dprintk("RPC:       set up xprt to %s (autobind) via %s\n",
-				xprt->address_strings[RPC_DISPLAY_ADDR],
-				xprt->address_strings[RPC_DISPLAY_PROTO]);
+	dprintk("RPC:       set up xprt to %s (port %s) via %s\n",
+			xprt->address_strings[RPC_DISPLAY_ADDR],
+			xprt->address_strings[RPC_DISPLAY_PORT],
+			xprt->address_strings[RPC_DISPLAY_PROTO]);
 
 	/*
 	 * Since we don't want connections for the backchannel, we set
@@ -2512,8 +2417,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
 		return xprt;
 	ret = ERR_PTR(-EINVAL);
 out_err:
-	kfree(xprt->slot);
-	kfree(xprt);
+	xprt_free(xprt);
 	return ret;
 }
 
@@ -2577,7 +2481,8 @@ void cleanup_socket_xprt(void)
 	xprt_unregister_transport(&xs_bc_tcp_transport);
 }
 
-static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
+static int param_set_uint_minmax(const char *val,
+		const struct kernel_param *kp,
 		unsigned int min, unsigned int max)
 {
 	unsigned long num;
@@ -2592,34 +2497,37 @@ static int param_set_uint_minmax(const char *val, struct kernel_param *kp,
 	return 0;
 }
 
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
 	return param_set_uint_minmax(val, kp,
 			RPC_MIN_RESVPORT,
 			RPC_MAX_RESVPORT);
 }
 
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
-{
-	return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_portnr = {
+	.set = param_set_portnr,
+	.get = param_get_uint,
+};
+
 #define param_check_portnr(name, p) \
 	__param_check(name, p, unsigned int);
 
 module_param_named(min_resvport, xprt_min_resvport, portnr, 0644);
 module_param_named(max_resvport, xprt_max_resvport, portnr, 0644);
 
-static int param_set_slot_table_size(const char *val, struct kernel_param *kp)
+static int param_set_slot_table_size(const char *val,
+				     const struct kernel_param *kp)
 {
 	return param_set_uint_minmax(val, kp,
 			RPC_MIN_SLOT_TABLE,
 			RPC_MAX_SLOT_TABLE);
 }
 
-static int param_get_slot_table_size(char *buffer, struct kernel_param *kp)
-{
-	return param_get_uint(buffer, kp);
-}
+static struct kernel_param_ops param_ops_slot_table_size = {
+	.set = param_set_slot_table_size,
+	.get = param_get_uint,
+};
+
 #define param_check_slot_table_size(name, p) \
 	__param_check(name, p, unsigned int);
 
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index c048543ffbe..8a2e89bffde 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -41,11 +41,6 @@
 #include "cluster.h"
 #include "net.h"
 
-u32 tipc_get_addr(void)
-{
-	return tipc_own_addr;
-}
-
 /**
  * tipc_addr_domain_valid - validates a network domain address
  *
@@ -89,7 +84,7 @@ int tipc_addr_domain_valid(u32 addr)
 
 int tipc_addr_node_valid(u32 addr)
 {
-	return (tipc_addr_domain_valid(addr) && tipc_node(addr));
+	return tipc_addr_domain_valid(addr) && tipc_node(addr);
 }
 
 int tipc_in_scope(u32 domain, u32 addr)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index a008c668930..22a60fc9839 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -121,6 +121,9 @@ static DEFINE_SPINLOCK(bc_lock);
 
 const char tipc_bclink_name[] = "broadcast-link";
 
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+			   struct tipc_node_map *nm_b,
+			   struct tipc_node_map *nm_diff);
 
 static u32 buf_seqno(struct sk_buff *buf)
 {
@@ -143,6 +146,19 @@ static void bcbuf_decr_acks(struct sk_buff *buf)
 }
 
 
+static void bclink_set_last_sent(void)
+{
+	if (bcl->next_out)
+		bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1);
+	else
+		bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1);
+}
+
+u32 tipc_bclink_get_last_sent(void)
+{
+	return bcl->fsm_msg_cnt;
+}
+
 /**
  * bclink_set_gap - set gap according to contents of current deferred pkt queue
  *
@@ -171,7 +187,7 @@ static void bclink_set_gap(struct tipc_node *n_ptr)
 
 static int bclink_ack_allowed(u32 n)
 {
-	return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag);
+	return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag;
 }
 
 
@@ -237,8 +253,10 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked)
 
 	/* Try resolving broadcast link congestion, if necessary */
 
-	if (unlikely(bcl->next_out))
+	if (unlikely(bcl->next_out)) {
 		tipc_link_push_queue(bcl);
+		bclink_set_last_sent();
+	}
 	if (unlikely(released && !list_empty(&bcl->waiting_ports)))
 		tipc_link_wakeup_ports(bcl, 0);
 	spin_unlock_bh(&bc_lock);
@@ -272,7 +290,7 @@ static void bclink_send_nack(struct tipc_node *n_ptr)
 	if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to))
 		return;
 
-	buf = buf_acquire(INT_H_SIZE);
+	buf = tipc_buf_acquire(INT_H_SIZE);
 	if (buf) {
 		msg = buf_msg(buf);
 		tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG,
@@ -395,7 +413,7 @@ int tipc_bclink_send_msg(struct sk_buff *buf)
 	if (unlikely(res == -ELINKCONG))
 		buf_discard(buf);
 	else
-		bcl->stats.sent_info++;
+		bclink_set_last_sent();
 
 	if (bcl->out_queue_size > bcl->stats.max_queue_sz)
 		bcl->stats.max_queue_sz = bcl->out_queue_size;
@@ -529,15 +547,6 @@ receive:
 	tipc_node_unlock(node);
 }
 
-u32 tipc_bclink_get_last_sent(void)
-{
-	u32 last_sent = mod(bcl->next_out_no - 1);
-
-	if (bcl->next_out)
-		last_sent = mod(buf_seqno(bcl->next_out) - 1);
-	return last_sent;
-}
-
 u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr)
 {
 	return (n_ptr->bclink.supported &&
@@ -570,6 +579,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
 		msg = buf_msg(buf);
 		msg_set_non_seq(msg, 1);
 		msg_set_mc_netid(msg, tipc_net_id);
+		bcl->stats.sent_info++;
 	}
 
 	/* Send buffer over bearers until all targets reached */
@@ -609,11 +619,13 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
 		bcbearer->remains = bcbearer->remains_new;
 	}
 
-	/* Unable to reach all targets */
+	/*
+	 * Unable to reach all targets (indicate success, since currently
+	 * there isn't code in place to properly block & unblock the
+	 * pseudo-bearer used by the broadcast link)
+	 */
 
-	bcbearer->bearer.publ.blocked = 1;
-	bcl->stats.bearer_congs++;
-	return 1;
+	return TIPC_OK;
 }
 
 /**
@@ -862,8 +874,9 @@ void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node)
  * @nm_diff: output node map A-B (i.e. nodes of A that are not in B)
  */
 
-void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b,
-				  struct tipc_node_map *nm_diff)
+static void tipc_nmap_diff(struct tipc_node_map *nm_a,
+			   struct tipc_node_map *nm_b,
+			   struct tipc_node_map *nm_diff)
 {
 	int stop = ARRAY_SIZE(nm_a->map);
 	int w;
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index e8c2b81658c..011c03f0a4a 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -84,9 +84,6 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, struct tipc_node_m
 	return !memcmp(nm_a, nm_b, sizeof(*nm_a));
 }
 
-void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b,
-				  struct tipc_node_map *nm_diff);
-
 void tipc_port_list_add(struct port_list *pl_ptr, u32 port);
 void tipc_port_list_free(struct port_list *pl_ptr);
 
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52ae17b2583..9927d1d56c4 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -63,7 +63,7 @@ static int media_name_valid(const char *name)
 	len = strlen(name);
 	if ((len + 1) > TIPC_MAX_MEDIA_NAME)
 		return 0;
-	return (strspn(name, tipc_alphabet) == len);
+	return strspn(name, tipc_alphabet) == len;
 }
 
 /**
@@ -288,9 +288,6 @@ static struct bearer *bearer_find(const char *name)
 	struct bearer *b_ptr;
 	u32 i;
 
-	if (tipc_mode != TIPC_NET_MODE)
-		return NULL;
-
 	for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) {
 		if (b_ptr->active && (!strcmp(b_ptr->publ.name, name)))
 			return b_ptr;
@@ -559,8 +556,6 @@ restart:
 	}
 
 	b_ptr = &tipc_bearers[bearer_id];
-	memset(b_ptr, 0, sizeof(struct bearer));
-
 	strcpy(b_ptr->publ.name, name);
 	res = m_ptr->enable_bearer(&b_ptr->publ);
 	if (res) {
@@ -630,30 +625,17 @@ int tipc_block_bearer(const char *name)
  * Note: This routine assumes caller holds tipc_net_lock.
  */
 
-static int bearer_disable(const char *name)
+static int bearer_disable(struct bearer *b_ptr)
 {
-	struct bearer *b_ptr;
 	struct link *l_ptr;
 	struct link *temp_l_ptr;
 
-	b_ptr = bearer_find(name);
-	if (!b_ptr) {
-		warn("Attempt to disable unknown bearer <%s>\n", name);
-		return -EINVAL;
-	}
-
-	info("Disabling bearer <%s>\n", name);
+	info("Disabling bearer <%s>\n", b_ptr->publ.name);
 	tipc_disc_stop_link_req(b_ptr->link_req);
 	spin_lock_bh(&b_ptr->publ.lock);
 	b_ptr->link_req = NULL;
 	b_ptr->publ.blocked = 1;
-	if (b_ptr->media->disable_bearer) {
-		spin_unlock_bh(&b_ptr->publ.lock);
-		write_unlock_bh(&tipc_net_lock);
-		b_ptr->media->disable_bearer(&b_ptr->publ);
-		write_lock_bh(&tipc_net_lock);
-		spin_lock_bh(&b_ptr->publ.lock);
-	}
+	b_ptr->media->disable_bearer(&b_ptr->publ);
 	list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) {
 		tipc_link_delete(l_ptr);
 	}
@@ -664,10 +646,16 @@ static int bearer_disable(const char *name)
 
 int tipc_disable_bearer(const char *name)
 {
+	struct bearer *b_ptr;
 	int res;
 
 	write_lock_bh(&tipc_net_lock);
-	res = bearer_disable(name);
+	b_ptr = bearer_find(name);
+	if (b_ptr == NULL) {
+		warn("Attempt to disable unknown bearer <%s>\n", name);
+		res = -EINVAL;
+	} else
+		res = bearer_disable(b_ptr);
 	write_unlock_bh(&tipc_net_lock);
 	return res;
 }
@@ -680,13 +668,7 @@ void tipc_bearer_stop(void)
 
 	for (i = 0; i < MAX_BEARERS; i++) {
 		if (tipc_bearers[i].active)
-			tipc_bearers[i].publ.blocked = 1;
-	}
-	for (i = 0; i < MAX_BEARERS; i++) {
-		if (tipc_bearers[i].active)
-			bearer_disable(tipc_bearers[i].publ.name);
+			bearer_disable(&tipc_bearers[i]);
 	}
 	media_count = 0;
 }
-
-
diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c
index e68f705381b..7fea14b98b9 100644
--- a/net/tipc/cluster.c
+++ b/net/tipc/cluster.c
@@ -113,25 +113,6 @@ void tipc_cltr_delete(struct cluster *c_ptr)
 	kfree(c_ptr);
 }
 
-u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr)
-{
-	struct tipc_node *n_ptr;
-	u32 n_num = tipc_node(addr) + 1;
-
-	if (!c_ptr)
-		return addr;
-	for (; n_num <= c_ptr->highest_node; n_num++) {
-		n_ptr = c_ptr->nodes[n_num];
-		if (n_ptr && tipc_node_has_active_links(n_ptr))
-			return n_ptr->addr;
-	}
-	for (n_num = 1; n_num < tipc_node(addr); n_num++) {
-		n_ptr = c_ptr->nodes[n_num];
-		if (n_ptr && tipc_node_has_active_links(n_ptr))
-			return n_ptr->addr;
-	}
-	return 0;
-}
 
 void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr)
 {
@@ -232,7 +213,7 @@ struct tipc_node *tipc_cltr_select_node(struct cluster *c_ptr, u32 selector)
 static struct sk_buff *tipc_cltr_prepare_routing_msg(u32 data_size, u32 dest)
 {
 	u32 size = INT_H_SIZE + data_size;
-	struct sk_buff *buf = buf_acquire(size);
+	struct sk_buff *buf = tipc_buf_acquire(size);
 	struct tipc_msg *msg;
 
 	if (buf) {
diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h
index 333efb0b9c4..32636d98c9c 100644
--- a/net/tipc/cluster.h
+++ b/net/tipc/cluster.h
@@ -75,7 +75,7 @@ void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr);
 void tipc_cltr_send_slave_routes(struct cluster *c_ptr, u32 dest);
 void tipc_cltr_broadcast(struct sk_buff *buf);
 int tipc_cltr_init(void);
-u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr);
+
 void tipc_cltr_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
 void tipc_cltr_send_local_routes(struct cluster *c_ptr, u32 dest);
 void tipc_cltr_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi);
diff --git a/net/tipc/config.c b/net/tipc/config.c
index 961d1b09714..50a6133a366 100644
--- a/net/tipc/config.c
+++ b/net/tipc/config.c
@@ -95,7 +95,7 @@ int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
 	return 1;
 }
 
-struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
+static struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
 {
 	struct sk_buff *buf;
 	__be32 value_net;
@@ -109,6 +109,11 @@ struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value)
 	return buf;
 }
 
+static struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
+{
+	return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
+}
+
 struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
 {
 	struct sk_buff *buf;
@@ -120,139 +125,6 @@ struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string)
 	return buf;
 }
 
-
-#if 0
-
-/* Now obsolete code for handling commands not yet implemented the new way */
-
-/*
- * Some of this code assumed that the manager structure contains two added
- * fields:
- *	u32 link_subscriptions;
- *	struct list_head link_subscribers;
- * which are currently not present.  These fields may need to be re-introduced
- * if and when support for link subscriptions is added.
- */
-
-void tipc_cfg_link_event(u32 addr, char *name, int up)
-{
-	/* TIPC DOESN'T HANDLE LINK EVENT SUBSCRIPTIONS AT THE MOMENT */
-}
-
-int tipc_cfg_cmd(const struct tipc_cmd_msg * msg,
-		 char *data,
-		 u32 sz,
-		 u32 *ret_size,
-		 struct tipc_portid *orig)
-{
-	int rv = -EINVAL;
-	u32 cmd = msg->cmd;
-
-	*ret_size = 0;
-	switch (cmd) {
-	case TIPC_REMOVE_LINK:
-	case TIPC_CMD_BLOCK_LINK:
-	case TIPC_CMD_UNBLOCK_LINK:
-		if (!cfg_check_connection(orig))
-			rv = link_control(msg->argv.link_name, msg->cmd, 0);
-		break;
-	case TIPC_ESTABLISH:
-		{
-			int connected;
-
-			tipc_isconnected(mng.conn_port_ref, &connected);
-			if (connected || !orig) {
-				rv = TIPC_FAILURE;
-				break;
-			}
-			rv = tipc_connect2port(mng.conn_port_ref, orig);
-			if (rv == TIPC_OK)
-				orig = 0;
-			break;
-		}
-	case TIPC_GET_PEER_ADDRESS:
-		*ret_size = link_peer_addr(msg->argv.link_name, data, sz);
-		break;
-	case TIPC_GET_ROUTES:
-		rv = TIPC_OK;
-		break;
-	default: {}
-	}
-	if (*ret_size)
-		rv = TIPC_OK;
-	return rv;
-}
-
-static void cfg_cmd_event(struct tipc_cmd_msg *msg,
-			  char *data,
-			  u32 sz,
-			  struct tipc_portid const *orig)
-{
-	int rv = -EINVAL;
-	struct tipc_cmd_result_msg rmsg;
-	struct iovec msg_sect[2];
-	int *arg;
-
-	msg->cmd = ntohl(msg->cmd);
-
-	cfg_prepare_res_msg(msg->cmd, msg->usr_handle, rv, &rmsg, msg_sect,
-			    data, 0);
-	if (ntohl(msg->magic) != TIPC_MAGIC)
-		goto exit;
-
-	switch (msg->cmd) {
-	case TIPC_CREATE_LINK:
-		if (!cfg_check_connection(orig))
-			rv = disc_create_link(&msg->argv.create_link);
-		break;
-	case TIPC_LINK_SUBSCRIBE:
-		{
-			struct subscr_data *sub;
-
-			if (mng.link_subscriptions > 64)
-				break;
-			sub = kmalloc(sizeof(*sub),
-							    GFP_ATOMIC);
-			if (sub == NULL) {
-				warn("Memory squeeze; dropped remote link subscription\n");
-				break;
-			}
-			INIT_LIST_HEAD(&sub->subd_list);
-			tipc_createport(mng.user_ref,
-					(void *)sub,
-					TIPC_HIGH_IMPORTANCE,
-					0,
-					0,
-					(tipc_conn_shutdown_event)cfg_linksubscr_cancel,
-					0,
-					0,
-					(tipc_conn_msg_event)cfg_linksubscr_cancel,
-					0,
-					&sub->port_ref);
-			if (!sub->port_ref) {
-				kfree(sub);
-				break;
-			}
-			memcpy(sub->usr_handle,msg->usr_handle,
-			       sizeof(sub->usr_handle));
-			sub->domain = msg->argv.domain;
-			list_add_tail(&sub->subd_list, &mng.link_subscribers);
-			tipc_connect2port(sub->port_ref, orig);
-			rmsg.retval = TIPC_OK;
-			tipc_send(sub->port_ref, 2u, msg_sect);
-			mng.link_subscriptions++;
-			return;
-		}
-	default:
-		rv = tipc_cfg_cmd(msg, data, sz, (u32 *)&msg_sect[1].iov_len, orig);
-	}
-exit:
-	rmsg.result_len = htonl(msg_sect[1].iov_len);
-	rmsg.retval = htonl(rv);
-	tipc_cfg_respond(msg_sect, 2u, orig);
-}
-#endif
-
 #define MAX_STATS_INFO 2000
 
 static struct sk_buff *tipc_show_stats(void)
@@ -557,14 +429,6 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area
 	case TIPC_CMD_SHOW_PORTS:
 		rep_tlv_buf = tipc_port_get_ports();
 		break;
-#if 0
-	case TIPC_CMD_SHOW_PORT_STATS:
-		rep_tlv_buf = port_show_stats(req_tlv_area, req_tlv_space);
-		break;
-	case TIPC_CMD_RESET_PORT_STATS:
-		rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED);
-		break;
-#endif
 	case TIPC_CMD_SET_LOG_SIZE:
 		rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space);
 		break;
diff --git a/net/tipc/config.h b/net/tipc/config.h
index 5cd7cc56c54..481e12ece71 100644
--- a/net/tipc/config.h
+++ b/net/tipc/config.h
@@ -45,7 +45,6 @@
 struct sk_buff *tipc_cfg_reply_alloc(int payload_size);
 int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type,
 			void *tlv_data, int tlv_data_size);
-struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value);
 struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string);
 
 static inline struct sk_buff *tipc_cfg_reply_none(void)
@@ -53,11 +52,6 @@ static inline struct sk_buff *tipc_cfg_reply_none(void)
 	return tipc_cfg_reply_alloc(0);
 }
 
-static inline struct sk_buff *tipc_cfg_reply_unsigned(u32 value)
-{
-	return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value);
-}
-
 static inline struct sk_buff *tipc_cfg_reply_error_string(char *string)
 {
 	return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string);
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 69646811798..e2a09eb8efd 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -96,13 +96,8 @@ int tipc_net_id;
 int tipc_remote_management;
 
 
-int tipc_get_mode(void)
-{
-	return tipc_mode;
-}
-
 /**
- * buf_acquire - creates a TIPC message buffer
+ * tipc_buf_acquire - creates a TIPC message buffer
  * @size: message size (including TIPC header)
  *
  * Returns a new buffer with data pointers set to the specified size.
@@ -111,7 +106,7 @@ int tipc_get_mode(void)
  *       There may also be unrequested tailroom present at the buffer's end.
  */
 
-struct sk_buff *buf_acquire(u32 size)
+struct sk_buff *tipc_buf_acquire(u32 size)
 {
 	struct sk_buff *skb;
 	unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u;
@@ -129,7 +124,7 @@ struct sk_buff *buf_acquire(u32 size)
  * tipc_core_stop_net - shut down TIPC networking sub-systems
  */
 
-void tipc_core_stop_net(void)
+static void tipc_core_stop_net(void)
 {
 	tipc_eth_media_stop();
 	tipc_net_stop();
@@ -154,7 +149,7 @@ int tipc_core_start_net(unsigned long addr)
  * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode
  */
 
-void tipc_core_stop(void)
+static void tipc_core_stop(void)
 {
 	if (tipc_mode != TIPC_NODE_MODE)
 		return;
@@ -169,13 +164,14 @@ void tipc_core_stop(void)
 	tipc_nametbl_stop();
 	tipc_ref_table_stop();
 	tipc_socket_stop();
+	tipc_log_resize(0);
 }
 
 /**
  * tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode
  */
 
-int tipc_core_start(void)
+static int tipc_core_start(void)
 {
 	int res;
 
@@ -203,7 +199,9 @@ static int __init tipc_init(void)
 {
 	int res;
 
-	tipc_log_resize(CONFIG_TIPC_LOG);
+	if (tipc_log_resize(CONFIG_TIPC_LOG) != 0)
+		warn("Unable to create log buffer\n");
+
 	info("Activated (version " TIPC_MOD_VER
 	     " compiled " __DATE__ " " __TIME__ ")\n");
 
@@ -230,7 +228,6 @@ static void __exit tipc_exit(void)
 	tipc_core_stop_net();
 	tipc_core_stop();
 	info("Deactivated\n");
-	tipc_log_resize(0);
 }
 
 module_init(tipc_init);
@@ -244,8 +241,6 @@ MODULE_VERSION(TIPC_MOD_VER);
 
 EXPORT_SYMBOL(tipc_attach);
 EXPORT_SYMBOL(tipc_detach);
-EXPORT_SYMBOL(tipc_get_addr);
-EXPORT_SYMBOL(tipc_get_mode);
 EXPORT_SYMBOL(tipc_createport);
 EXPORT_SYMBOL(tipc_deleteport);
 EXPORT_SYMBOL(tipc_ownidentity);
@@ -260,23 +255,10 @@ EXPORT_SYMBOL(tipc_withdraw);
 EXPORT_SYMBOL(tipc_connect2port);
 EXPORT_SYMBOL(tipc_disconnect);
 EXPORT_SYMBOL(tipc_shutdown);
-EXPORT_SYMBOL(tipc_isconnected);
-EXPORT_SYMBOL(tipc_peer);
-EXPORT_SYMBOL(tipc_ref_valid);
 EXPORT_SYMBOL(tipc_send);
-EXPORT_SYMBOL(tipc_send_buf);
 EXPORT_SYMBOL(tipc_send2name);
-EXPORT_SYMBOL(tipc_forward2name);
-EXPORT_SYMBOL(tipc_send_buf2name);
-EXPORT_SYMBOL(tipc_forward_buf2name);
 EXPORT_SYMBOL(tipc_send2port);
-EXPORT_SYMBOL(tipc_forward2port);
-EXPORT_SYMBOL(tipc_send_buf2port);
-EXPORT_SYMBOL(tipc_forward_buf2port);
 EXPORT_SYMBOL(tipc_multicast);
-/* EXPORT_SYMBOL(tipc_multicast_buf); not available yet */
-EXPORT_SYMBOL(tipc_ispublished);
-EXPORT_SYMBOL(tipc_available_nodes);
 
 /* TIPC API for external bearers (see tipc_bearer.h) */
 
@@ -293,6 +275,4 @@ EXPORT_SYMBOL(tipc_createport_raw);
 EXPORT_SYMBOL(tipc_reject_msg);
 EXPORT_SYMBOL(tipc_send_buf_fast);
 EXPORT_SYMBOL(tipc_acknowledge);
-EXPORT_SYMBOL(tipc_get_port);
-EXPORT_SYMBOL(tipc_get_handle);
 
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 188799017ab..e19389e5722 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -83,9 +83,7 @@
  * Note: TIPC_LOG is configured to echo its output to the system console;
  *       user-defined buffers can be configured to do the same thing.
  */
-
 extern struct print_buf *const TIPC_NULL;
-extern struct print_buf *const TIPC_CONS;
 extern struct print_buf *const TIPC_LOG;
 
 void tipc_printf(struct print_buf *, const char *fmt, ...);
@@ -204,10 +202,7 @@ extern atomic_t tipc_user_count;
  * Routines available to privileged subsystems
  */
 
-extern int  tipc_core_start(void);
-extern void tipc_core_stop(void);
-extern int  tipc_core_start_net(unsigned long addr);
-extern void tipc_core_stop_net(void);
+extern int tipc_core_start_net(unsigned long);
 extern int  tipc_handler_start(void);
 extern void tipc_handler_stop(void);
 extern int  tipc_netlink_start(void);
@@ -328,7 +323,7 @@ static inline struct tipc_msg *buf_msg(struct sk_buff *skb)
 	return (struct tipc_msg *)skb->data;
 }
 
-extern struct sk_buff *buf_acquire(u32 size);
+extern struct sk_buff *tipc_buf_acquire(u32 size);
 
 /**
  * buf_discard - frees a TIPC message buffer
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 1885a7edb0c..46f51d208e5 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -52,7 +52,7 @@ static struct print_buf null_buf = { NULL, 0, NULL, 0 };
 struct print_buf *const TIPC_NULL = &null_buf;
 
 static struct print_buf cons_buf = { NULL, 0, NULL, 1 };
-struct print_buf *const TIPC_CONS = &cons_buf;
+static struct print_buf *const TIPC_CONS = &cons_buf;
 
 static struct print_buf log_buf = { NULL, 0, NULL, 1 };
 struct print_buf *const TIPC_LOG = &log_buf;
@@ -76,6 +76,10 @@ struct print_buf *const TIPC_LOG = &log_buf;
 static char print_string[TIPC_PB_MAX_STR];
 static DEFINE_SPINLOCK(print_lock);
 
+static void tipc_printbuf_reset(struct print_buf *pb);
+static int  tipc_printbuf_empty(struct print_buf *pb);
+static void tipc_printbuf_move(struct print_buf *pb_to,
+			       struct print_buf *pb_from);
 
 #define FORMAT(PTR,LEN,FMT) \
 {\
@@ -116,7 +120,7 @@ void tipc_printbuf_init(struct print_buf *pb, char *raw, u32 size)
  * @pb: pointer to print buffer structure
  */
 
-void tipc_printbuf_reset(struct print_buf *pb)
+static void tipc_printbuf_reset(struct print_buf *pb)
 {
 	if (pb->buf) {
 		pb->crs = pb->buf;
@@ -132,9 +136,9 @@ void tipc_printbuf_reset(struct print_buf *pb)
  * Returns non-zero if print buffer is empty.
  */
 
-int tipc_printbuf_empty(struct print_buf *pb)
+static int tipc_printbuf_empty(struct print_buf *pb)
 {
-	return (!pb->buf || (pb->crs == pb->buf));
+	return !pb->buf || (pb->crs == pb->buf);
 }
 
 /**
@@ -169,7 +173,7 @@ int tipc_printbuf_validate(struct print_buf *pb)
 			tipc_printf(pb, err);
 		}
 	}
-	return (pb->crs - pb->buf + 1);
+	return pb->crs - pb->buf + 1;
 }
 
 /**
@@ -181,7 +185,8 @@ int tipc_printbuf_validate(struct print_buf *pb)
  * Source print buffer becomes empty if a successful move occurs.
  */
 
-void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from)
+static void tipc_printbuf_move(struct print_buf *pb_to,
+			       struct print_buf *pb_from)
 {
 	int len;
 
diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h
index 5ef1bc8f64e..3ba6ba8b434 100644
--- a/net/tipc/dbg.h
+++ b/net/tipc/dbg.h
@@ -56,10 +56,7 @@ struct print_buf {
 #define TIPC_PB_MAX_STR 512	/* max printable string (with trailing NUL) */
 
 void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size);
-void tipc_printbuf_reset(struct print_buf *pb);
-int  tipc_printbuf_empty(struct print_buf *pb);
 int  tipc_printbuf_validate(struct print_buf *pb);
-void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from);
 
 int tipc_log_resize(int log_size);
 
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index fc1fcf5e6b5..4a7cd3719b7 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -46,16 +46,6 @@
 #define TIPC_LINK_REQ_FAST	2000	/* normal delay if bearer has no links */
 #define TIPC_LINK_REQ_SLOW	600000	/* normal delay if bearer has links */
 
-#if 0
-#define  GET_NODE_INFO         300
-#define  GET_NODE_INFO_RESULT  301
-#define  FORWARD_LINK_PROBE    302
-#define  LINK_REQUEST_REJECTED 303
-#define  LINK_REQUEST_ACCEPTED 304
-#define  DROP_LINK_REQUEST     305
-#define  CHECK_LINK_COUNT      306
-#endif
-
 /*
  * TODO: Most of the inter-cluster setup stuff should be
  * rewritten, and be made conformant with specification.
@@ -78,30 +68,6 @@ struct link_req {
 	unsigned int timer_intv;
 };
 
-
-#if 0
-int disc_create_link(const struct tipc_link_create *argv)
-{
-	/*
-	 * Code for inter cluster link setup here
-	 */
-	return TIPC_OK;
-}
-#endif
-
-/*
- * disc_lost_link(): A link has lost contact
- */
-
-void tipc_disc_link_event(u32 addr, char *name, int up)
-{
-	if (in_own_cluster(addr))
-		return;
-	/*
-	 * Code for inter cluster link setup here
-	 */
-}
-
 /**
  * tipc_disc_init_msg - initialize a link setup message
  * @type: message type (request or response)
@@ -115,7 +81,7 @@ static struct sk_buff *tipc_disc_init_msg(u32 type,
 					  u32 dest_domain,
 					  struct bearer *b_ptr)
 {
-	struct sk_buff *buf = buf_acquire(DSC_H_SIZE);
+	struct sk_buff *buf = tipc_buf_acquire(DSC_H_SIZE);
 	struct tipc_msg *msg;
 
 	if (buf) {
@@ -203,6 +169,14 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr)
 				return;
 		}
 		spin_lock_bh(&n_ptr->lock);
+
+		/* Don't talk to neighbor during cleanup after last session */
+
+		if (n_ptr->cleanup_required) {
+			spin_unlock_bh(&n_ptr->lock);
+			return;
+		}
+
 		link = n_ptr->links[b_ptr->identity];
 		if (!link) {
 			dbg("creating link\n");
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index c36eaeb7d5d..f8e75063612 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -50,9 +50,4 @@ void tipc_disc_stop_link_req(struct link_req *req);
 
 void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr);
 
-void tipc_disc_link_event(u32 addr, char *name, int up);
-#if 0
-int  disc_create_link(const struct tipc_link_create *argv);
-#endif
-
 #endif
diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c
index 6230d16020c..6e988ba485f 100644
--- a/net/tipc/eth_media.c
+++ b/net/tipc/eth_media.c
@@ -72,17 +72,26 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
 {
 	struct sk_buff *clone;
 	struct net_device *dev;
+	int delta;
 
 	clone = skb_clone(buf, GFP_ATOMIC);
-	if (clone) {
-		skb_reset_network_header(clone);
-		dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
-		clone->dev = dev;
-		dev_hard_header(clone, dev, ETH_P_TIPC,
-				 &dest->dev_addr.eth_addr,
-				 dev->dev_addr, clone->len);
-		dev_queue_xmit(clone);
+	if (!clone)
+		return 0;
+
+	dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev;
+	delta = dev->hard_header_len - skb_headroom(buf);
+
+	if ((delta > 0) &&
+	    pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+		kfree_skb(clone);
+		return 0;
 	}
+
+	skb_reset_network_header(clone);
+	clone->dev = dev;
+	dev_hard_header(clone, dev, ETH_P_TIPC, &dest->dev_addr.eth_addr,
+			dev->dev_addr, clone->len);
+	dev_queue_xmit(clone);
 	return 0;
 }
 
@@ -92,15 +101,12 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr,
  * Accept only packets explicitly sent to this node, or broadcast packets;
  * ignores packets sent using Ethernet multicast, and traffic sent to other
  * nodes (which can happen if interface is running in promiscuous mode).
- * Routine truncates any Ethernet padding/CRC appended to the message,
- * and ensures message size matches actual length
  */
 
 static int recv_msg(struct sk_buff *buf, struct net_device *dev,
 		    struct packet_type *pt, struct net_device *orig_dev)
 {
 	struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv;
-	u32 size;
 
 	if (!net_eq(dev_net(dev), &init_net)) {
 		kfree_skb(buf);
@@ -109,13 +115,9 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev,
 
 	if (likely(eb_ptr->bearer)) {
 		if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
-			size = msg_size((struct tipc_msg *)buf->data);
-			skb_trim(buf, size);
-			if (likely(buf->len == size)) {
-				buf->next = NULL;
-				tipc_recv_msg(buf, eb_ptr->bearer);
-				return 0;
-			}
+			buf->next = NULL;
+			tipc_recv_msg(buf, eb_ptr->bearer);
+			return 0;
 		}
 	}
 	kfree_skb(buf);
@@ -133,6 +135,16 @@ static int enable_bearer(struct tipc_bearer *tb_ptr)
 	struct eth_bearer *eb_ptr = &eth_bearers[0];
 	struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS];
 	char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1;
+	int pending_dev = 0;
+
+	/* Find unused Ethernet bearer structure */
+
+	while (eb_ptr->dev) {
+		if (!eb_ptr->bearer)
+			pending_dev++;
+		if (++eb_ptr == stop)
+			return pending_dev ? -EAGAIN : -EDQUOT;
+	}
 
 	/* Find device with specified name */
 
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a3616b99529..b31992ccd5d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -99,23 +99,6 @@ struct link_name {
 	char if_peer[TIPC_MAX_IF_NAME];
 };
 
-#if 0
-
-/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
-
-/**
- * struct link_event - link up/down event notification
- */
-
-struct link_event {
-	u32 addr;
-	int up;
-	void (*fcn)(u32, char *, int);
-	char name[TIPC_MAX_LINK_NAME];
-};
-
-#endif
-
 static void link_handle_out_of_seq_msg(struct link *l_ptr,
 				       struct sk_buff *buf);
 static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf);
@@ -129,6 +112,9 @@ static void link_state_event(struct link *l_ptr, u32 event);
 static void link_reset_statistics(struct link *l_ptr);
 static void link_print(struct link *l_ptr, struct print_buf *buf,
 		       const char *str);
+static void link_start(struct link *l_ptr);
+static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
+
 
 /*
  * Debugging code used by link routines only
@@ -239,13 +225,13 @@ int tipc_link_is_up(struct link *l_ptr)
 {
 	if (!l_ptr)
 		return 0;
-	return (link_working_working(l_ptr) || link_working_unknown(l_ptr));
+	return link_working_working(l_ptr) || link_working_unknown(l_ptr);
 }
 
 int tipc_link_is_active(struct link *l_ptr)
 {
-	return ((l_ptr->owner->active_links[0] == l_ptr) ||
-		(l_ptr->owner->active_links[1] == l_ptr));
+	return	(l_ptr->owner->active_links[0] == l_ptr) ||
+		(l_ptr->owner->active_links[1] == l_ptr);
 }
 
 /**
@@ -459,7 +445,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer,
 
 	k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr);
 	list_add_tail(&l_ptr->link_list, &b_ptr->links);
-	tipc_k_signal((Handler)tipc_link_start, (unsigned long)l_ptr);
+	tipc_k_signal((Handler)link_start, (unsigned long)l_ptr);
 
 	dbg("tipc_link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n",
 	    l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit);
@@ -499,9 +485,9 @@ void tipc_link_delete(struct link *l_ptr)
 	kfree(l_ptr);
 }
 
-void tipc_link_start(struct link *l_ptr)
+static void link_start(struct link *l_ptr)
 {
-	dbg("tipc_link_start %x\n", l_ptr);
+	dbg("link_start %x\n", l_ptr);
 	link_state_event(l_ptr, STARTING_EVT);
 }
 
@@ -634,39 +620,9 @@ void tipc_link_stop(struct link *l_ptr)
 	l_ptr->proto_msg_queue = NULL;
 }
 
-#if 0
-
 /* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */
-
-static void link_recv_event(struct link_event *ev)
-{
-	ev->fcn(ev->addr, ev->name, ev->up);
-	kfree(ev);
-}
-
-static void link_send_event(void (*fcn)(u32 a, char *n, int up),
-			    struct link *l_ptr, int up)
-{
-	struct link_event *ev;
-
-	ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
-	if (!ev) {
-		warn("Link event allocation failure\n");
-		return;
-	}
-	ev->addr = l_ptr->addr;
-	ev->up = up;
-	ev->fcn = fcn;
-	memcpy(ev->name, l_ptr->name, TIPC_MAX_LINK_NAME);
-	tipc_k_signal((Handler)link_recv_event, (unsigned long)ev);
-}
-
-#else
-
 #define link_send_event(fcn, l_ptr, up) do { } while (0)
 
-#endif
-
 void tipc_link_reset(struct link *l_ptr)
 {
 	struct sk_buff *buf;
@@ -690,10 +646,7 @@ void tipc_link_reset(struct link *l_ptr)
 
 	tipc_node_link_down(l_ptr->owner, l_ptr);
 	tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr);
-#if 0
-	tipc_printf(TIPC_CONS, "\nReset link <%s>\n", l_ptr->name);
-	dbg_link_dump();
-#endif
+
 	if (was_active_link && tipc_node_has_active_links(l_ptr->owner) &&
 	    l_ptr->owner->permit_changeover) {
 		l_ptr->reset_checkpoint = checkpoint;
@@ -1050,7 +1003,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
 	/* Fragmentation needed ? */
 
 	if (size > max_packet)
-		return tipc_link_send_long_buf(l_ptr, buf);
+		return link_send_long_buf(l_ptr, buf);
 
 	/* Packet can be queued or sent: */
 
@@ -1086,7 +1039,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf)
 		/* Try creating a new bundle */
 
 		if (size <= max_packet * 2 / 3) {
-			struct sk_buff *bundler = buf_acquire(max_packet);
+			struct sk_buff *bundler = tipc_buf_acquire(max_packet);
 			struct tipc_msg bundler_hdr;
 
 			if (bundler) {
@@ -1362,7 +1315,7 @@ again:
 
 	/* Prepare header of first fragment: */
 
-	buf_chain = buf = buf_acquire(max_pkt);
+	buf_chain = buf = tipc_buf_acquire(max_pkt);
 	if (!buf)
 		return -ENOMEM;
 	buf->next = NULL;
@@ -1419,7 +1372,7 @@ error:
 			msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE);
 			msg_set_fragm_no(&fragm_hdr, ++fragm_no);
 			prev = buf;
-			buf = buf_acquire(fragm_sz + INT_H_SIZE);
+			buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
 			if (!buf)
 				goto error;
 
@@ -1802,6 +1755,15 @@ static int link_recv_buf_validate(struct sk_buff *buf)
 	return pskb_may_pull(buf, hdr_size);
 }
 
+/**
+ * tipc_recv_msg - process TIPC messages arriving from off-node
+ * @head: pointer to message buffer chain
+ * @tb_ptr: pointer to bearer message arrived on
+ *
+ * Invoked with no locks held.  Bearer pointer must point to a valid bearer
+ * structure (i.e. cannot be NULL), but bearer can be inactive.
+ */
+
 void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
 {
 	read_lock_bh(&tipc_net_lock);
@@ -1819,6 +1781,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
 
 		head = head->next;
 
+		/* Ensure bearer is still enabled */
+
+		if (unlikely(!b_ptr->active))
+			goto cont;
+
 		/* Ensure message is well-formed */
 
 		if (unlikely(!link_recv_buf_validate(buf)))
@@ -1855,13 +1822,22 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr)
 				goto cont;
 		}
 
-		/* Locate unicast link endpoint that should handle message */
+		/* Locate neighboring node that sent message */
 
 		n_ptr = tipc_node_find(msg_prevnode(msg));
 		if (unlikely(!n_ptr))
 			goto cont;
 		tipc_node_lock(n_ptr);
 
+		/* Don't talk to neighbor during cleanup after last session */
+
+		if (n_ptr->cleanup_required) {
+			tipc_node_unlock(n_ptr);
+			goto cont;
+		}
+
+		/* Locate unicast link endpoint that should handle message */
+
 		l_ptr = n_ptr->links[b_ptr->identity];
 		if (unlikely(!l_ptr)) {
 			tipc_node_unlock(n_ptr);
@@ -2172,7 +2148,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
 	if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) {
 		if (!l_ptr->proto_msg_queue) {
 			l_ptr->proto_msg_queue =
-				buf_acquire(sizeof(l_ptr->proto_msg));
+				tipc_buf_acquire(sizeof(l_ptr->proto_msg));
 		}
 		buf = l_ptr->proto_msg_queue;
 		if (!buf)
@@ -2186,7 +2162,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg,
 
 	msg_dbg(msg, ">>");
 
-	buf = buf_acquire(msg_size);
+	buf = tipc_buf_acquire(msg_size);
 	if (!buf)
 		return;
 
@@ -2345,10 +2321,10 @@ exit:
  * tipc_link_tunnel(): Send one message via a link belonging to
  * another bearer. Owner node is locked.
  */
-void tipc_link_tunnel(struct link *l_ptr,
-		      struct tipc_msg *tunnel_hdr,
-		      struct tipc_msg  *msg,
-		      u32 selector)
+static void tipc_link_tunnel(struct link *l_ptr,
+			     struct tipc_msg *tunnel_hdr,
+			     struct tipc_msg  *msg,
+			     u32 selector)
 {
 	struct link *tunnel;
 	struct sk_buff *buf;
@@ -2361,7 +2337,7 @@ void tipc_link_tunnel(struct link *l_ptr,
 		return;
 	}
 	msg_set_size(tunnel_hdr, length + INT_H_SIZE);
-	buf = buf_acquire(length + INT_H_SIZE);
+	buf = tipc_buf_acquire(length + INT_H_SIZE);
 	if (!buf) {
 		warn("Link changeover error, "
 		     "unable to send tunnel msg\n");
@@ -2407,7 +2383,7 @@ void tipc_link_changeover(struct link *l_ptr)
 	if (!l_ptr->first_out) {
 		struct sk_buff *buf;
 
-		buf = buf_acquire(INT_H_SIZE);
+		buf = tipc_buf_acquire(INT_H_SIZE);
 		if (buf) {
 			skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE);
 			msg_set_size(&tunnel_hdr, INT_H_SIZE);
@@ -2468,7 +2444,7 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel)
 		msg_set_ack(msg, mod(l_ptr->next_in_no - 1));	/* Update */
 		msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in);
 		msg_set_size(&tunnel_hdr, length + INT_H_SIZE);
-		outbuf = buf_acquire(length + INT_H_SIZE);
+		outbuf = tipc_buf_acquire(length + INT_H_SIZE);
 		if (outbuf == NULL) {
 			warn("Link changeover error, "
 			     "unable to send duplicate msg\n");
@@ -2504,7 +2480,7 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos)
 	u32 size = msg_size(msg);
 	struct sk_buff *eb;
 
-	eb = buf_acquire(size);
+	eb = tipc_buf_acquire(size);
 	if (eb)
 		skb_copy_to_linear_data(eb, msg, size);
 	return eb;
@@ -2632,11 +2608,11 @@ void tipc_link_recv_bundle(struct sk_buff *buf)
 
 
 /*
- * tipc_link_send_long_buf: Entry for buffers needing fragmentation.
+ * link_send_long_buf: Entry for buffers needing fragmentation.
  * The buffer is complete, inclusive total message length.
  * Returns user data length.
  */
-int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
+static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
 {
 	struct tipc_msg *inmsg = buf_msg(buf);
 	struct tipc_msg fragm_hdr;
@@ -2675,7 +2651,7 @@ int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf)
 			fragm_sz = rest;
 			msg_set_type(&fragm_hdr, LAST_FRAGMENT);
 		}
-		fragm = buf_acquire(fragm_sz + INT_H_SIZE);
+		fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE);
 		if (fragm == NULL) {
 			warn("Link unable to fragment message\n");
 			dsz = -ENOMEM;
@@ -2780,7 +2756,7 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb,
 			buf_discard(fbuf);
 			return 0;
 		}
-		pbuf = buf_acquire(msg_size(imsg));
+		pbuf = tipc_buf_acquire(msg_size(imsg));
 		if (pbuf != NULL) {
 			pbuf->next = *pending;
 			*pending = pbuf;
@@ -3174,44 +3150,6 @@ struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_s
 	return buf;
 }
 
-#if 0
-int link_control(const char *name, u32 op, u32 val)
-{
-	int res = -EINVAL;
-	struct link *l_ptr;
-	u32 bearer_id;
-	struct tipc_node * node;
-	u32 a;
-
-	a = link_name2addr(name, &bearer_id);
-	read_lock_bh(&tipc_net_lock);
-	node = tipc_node_find(a);
-	if (node) {
-		tipc_node_lock(node);
-		l_ptr = node->links[bearer_id];
-		if (l_ptr) {
-			if (op == TIPC_REMOVE_LINK) {
-				struct bearer *b_ptr = l_ptr->b_ptr;
-				spin_lock_bh(&b_ptr->publ.lock);
-				tipc_link_delete(l_ptr);
-				spin_unlock_bh(&b_ptr->publ.lock);
-			}
-			if (op == TIPC_CMD_BLOCK_LINK) {
-				tipc_link_reset(l_ptr);
-				l_ptr->blocked = 1;
-			}
-			if (op == TIPC_CMD_UNBLOCK_LINK) {
-				l_ptr->blocked = 0;
-			}
-			res = 0;
-		}
-		tipc_node_unlock(node);
-	}
-	read_unlock_bh(&tipc_net_lock);
-	return res;
-}
-#endif
-
 /**
  * tipc_link_get_max_pkt - get maximum packet size to use when sending to destination
  * @dest: network address of destination node
@@ -3242,28 +3180,6 @@ u32 tipc_link_get_max_pkt(u32 dest, u32 selector)
 	return res;
 }
 
-#if 0
-static void link_dump_rec_queue(struct link *l_ptr)
-{
-	struct sk_buff *crs;
-
-	if (!l_ptr->oldest_deferred_in) {
-		info("Reception queue empty\n");
-		return;
-	}
-	info("Contents of Reception queue:\n");
-	crs = l_ptr->oldest_deferred_in;
-	while (crs) {
-		if (crs->data == (void *)0x0000a3a3) {
-			info("buffer %x invalid\n", crs);
-			return;
-		}
-		msg_dbg(buf_msg(crs), "In rec queue:\n");
-		crs = crs->next;
-	}
-}
-#endif
-
 static void link_dump_send_queue(struct link *l_ptr)
 {
 	if (l_ptr->next_out) {
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 2e5385c47d3..f98bc613de6 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -210,10 +210,6 @@ struct link {
 		u32 msg_length_counts;
 		u32 msg_lengths_total;
 		u32 msg_length_profile[7];
-#if 0
-		u32 sent_tunneled;
-		u32 recv_tunneled;
-#endif
 	} stats;
 
 	struct print_buf print_buf;
@@ -229,7 +225,6 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *dest);
 void tipc_link_reset_fragments(struct link *l_ptr);
 int tipc_link_is_up(struct link *l_ptr);
 int tipc_link_is_active(struct link *l_ptr);
-void tipc_link_start(struct link *l_ptr);
 u32 tipc_link_push_packet(struct link *l_ptr);
 void tipc_link_stop(struct link *l_ptr);
 struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd);
@@ -243,9 +238,6 @@ int tipc_link_send_sections_fast(struct port* sender,
 				 struct iovec const *msg_sect,
 				 const u32 num_sect,
 				 u32 destnode);
-int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf);
-void tipc_link_tunnel(struct link *l_ptr, struct tipc_msg *tnl_hdr,
-		      struct tipc_msg *msg, u32 selector);
 void tipc_link_recv_bundle(struct sk_buff *buf);
 int  tipc_link_recv_fragment(struct sk_buff **pending,
 			     struct sk_buff **fb,
@@ -279,12 +271,12 @@ static inline int between(u32 lower, u32 upper, u32 n)
 
 static inline int less_eq(u32 left, u32 right)
 {
-	return (mod(right - left) < 32768u);
+	return mod(right - left) < 32768u;
 }
 
 static inline int less(u32 left, u32 right)
 {
-	return (less_eq(left, right) && (mod(right) != mod(left)));
+	return less_eq(left, right) && (mod(right) != mod(left));
 }
 
 static inline u32 lesser(u32 left, u32 right)
@@ -299,32 +291,32 @@ static inline u32 lesser(u32 left, u32 right)
 
 static inline int link_working_working(struct link *l_ptr)
 {
-	return (l_ptr->state == WORKING_WORKING);
+	return l_ptr->state == WORKING_WORKING;
 }
 
 static inline int link_working_unknown(struct link *l_ptr)
 {
-	return (l_ptr->state == WORKING_UNKNOWN);
+	return l_ptr->state == WORKING_UNKNOWN;
 }
 
 static inline int link_reset_unknown(struct link *l_ptr)
 {
-	return (l_ptr->state == RESET_UNKNOWN);
+	return l_ptr->state == RESET_UNKNOWN;
 }
 
 static inline int link_reset_reset(struct link *l_ptr)
 {
-	return (l_ptr->state == RESET_RESET);
+	return l_ptr->state == RESET_RESET;
 }
 
 static inline int link_blocked(struct link *l_ptr)
 {
-	return (l_ptr->exp_msg_count || l_ptr->blocked);
+	return l_ptr->exp_msg_count || l_ptr->blocked;
 }
 
 static inline int link_congested(struct link *l_ptr)
 {
-	return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]);
+	return l_ptr->out_queue_size >= l_ptr->queue_limit[0];
 }
 
 #endif
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 381063817b4..ecb532fb035 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -112,7 +112,7 @@ int tipc_msg_build(struct tipc_msg *hdr,
 		return dsz;
 	}
 
-	*buf = buf_acquire(sz);
+	*buf = tipc_buf_acquire(sz);
 	if (!(*buf))
 		return -ENOMEM;
 	skb_copy_to_linear_data(*buf, hdr, hsz);
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 995d2da35b0..031aad18efc 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -104,7 +104,7 @@ static inline u32 msg_user(struct tipc_msg *m)
 
 static inline u32 msg_isdata(struct tipc_msg *m)
 {
-	return (msg_user(m) <= TIPC_CRITICAL_IMPORTANCE);
+	return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE;
 }
 
 static inline void msg_set_user(struct tipc_msg *m, u32 n)
@@ -289,7 +289,7 @@ static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
 
 static inline int msg_is_dest(struct tipc_msg *m, u32 d)
 {
-	return(msg_short(m) || (msg_destnode(m) == d));
+	return msg_short(m) || (msg_destnode(m) == d);
 }
 
 static inline u32 msg_routed(struct tipc_msg *m)
@@ -632,7 +632,7 @@ static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
 
 static inline u32 msg_max_pkt(struct tipc_msg *m)
 {
-	return (msg_bits(m, 9, 16, 0xffff) * 4);
+	return msg_bits(m, 9, 16, 0xffff) * 4;
 }
 
 static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 6ac3c543250..7b907171f87 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -98,7 +98,7 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
 
 static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest)
 {
-	struct sk_buff *buf = buf_acquire(LONG_H_SIZE + size);
+	struct sk_buff *buf = tipc_buf_acquire(LONG_H_SIZE + size);
 	struct tipc_msg *msg;
 
 	if (buf != NULL) {
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 8ba79620db3..3a8de4334da 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -116,7 +116,7 @@ DEFINE_RWLOCK(tipc_nametbl_lock);
 
 static int hash(int x)
 {
-	return(x & (tipc_nametbl_size - 1));
+	return x & (tipc_nametbl_size - 1);
 }
 
 /**
@@ -613,8 +613,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower,
 }
 
 /*
- * tipc_nametbl_translate(): Translate tipc_name -> tipc_portid.
- *                      Very time-critical.
+ * tipc_nametbl_translate - translate name to port id
  *
  * Note: on entry 'destnode' is the search domain used during translation;
  *       on exit it passes back the node address of the matching port (if any)
@@ -685,7 +684,6 @@ found:
 	}
 	spin_unlock_bh(&seq->lock);
 not_found:
-	*destnode = 0;
 	read_unlock_bh(&tipc_nametbl_lock);
 	return 0;
 }
@@ -877,7 +875,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
 			u32 index)
 {
 	char portIdStr[27];
-	char *scopeStr;
+	const char *scope_str[] = {"", " zone", " cluster", " node"};
 	struct publication *publ = sseq->zone_list;
 
 	tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper);
@@ -893,15 +891,8 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth,
 			 tipc_node(publ->node), publ->ref);
 		tipc_printf(buf, "%-26s ", portIdStr);
 		if (depth > 3) {
-			if (publ->node != tipc_own_addr)
-				scopeStr = "";
-			else if (publ->scope == TIPC_NODE_SCOPE)
-				scopeStr = "node";
-			else if (publ->scope == TIPC_CLUSTER_SCOPE)
-				scopeStr = "cluster";
-			else
-				scopeStr = "zone";
-			tipc_printf(buf, "%-10u %s", publ->key, scopeStr);
+			tipc_printf(buf, "%-10u %s", publ->key,
+				    scope_str[publ->scope]);
 		}
 
 		publ = publ->zone_list_next;
@@ -951,24 +942,19 @@ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth,
 
 static void nametbl_header(struct print_buf *buf, u32 depth)
 {
-	tipc_printf(buf, "Type       ");
-
-	if (depth > 1)
-		tipc_printf(buf, "Lower      Upper      ");
-	if (depth > 2)
-		tipc_printf(buf, "Port Identity              ");
-	if (depth > 3)
-		tipc_printf(buf, "Publication");
-
-	tipc_printf(buf, "\n-----------");
-
-	if (depth > 1)
-		tipc_printf(buf, "--------------------- ");
-	if (depth > 2)
-		tipc_printf(buf, "-------------------------- ");
-	if (depth > 3)
-		tipc_printf(buf, "------------------");
-
+	const char *header[] = {
+		"Type       ",
+		"Lower      Upper      ",
+		"Port Identity              ",
+		"Publication Scope"
+	};
+
+	int i;
+
+	if (depth > 4)
+		depth = 4;
+	for (i = 0; i < depth; i++)
+		tipc_printf(buf, header[i]);
 	tipc_printf(buf, "\n");
 }
 
@@ -1023,16 +1009,6 @@ static void nametbl_list(struct print_buf *buf, u32 depth_info,
 	}
 }
 
-#if 0
-void tipc_nametbl_print(struct print_buf *buf, const char *str)
-{
-	tipc_printf(buf, str);
-	read_lock_bh(&tipc_nametbl_lock);
-	nametbl_list(buf, 0, 0, 0, 0);
-	read_unlock_bh(&tipc_nametbl_lock);
-}
-#endif
-
 #define MAX_NAME_TBL_QUERY 32768
 
 struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
@@ -1065,13 +1041,6 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space)
 	return buf;
 }
 
-#if 0
-void tipc_nametbl_dump(void)
-{
-	nametbl_list(TIPC_CONS, 0, 0, 0, 0);
-}
-#endif
-
 int tipc_nametbl_init(void)
 {
 	table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head),
diff --git a/net/tipc/net.c b/net/tipc/net.c
index f61b7694138..1a621cfd660 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -129,15 +129,6 @@ u32 tipc_net_select_router(u32 addr, u32 ref)
 	return tipc_zone_select_router(tipc_net.zones[tipc_zone(addr)], addr, ref);
 }
 
-#if 0
-u32 tipc_net_next_node(u32 a)
-{
-	if (tipc_net.zones[tipc_zone(a)])
-		return tipc_zone_next_node(a);
-	return 0;
-}
-#endif
-
 void tipc_net_remove_as_router(u32 router)
 {
 	u32 z_num;
@@ -248,6 +239,7 @@ void tipc_net_route_msg(struct sk_buff *buf)
 
 	/* Handle message for another node */
 	msg_dbg(msg, "NET>SEND>: ");
+	skb_trim(buf, msg_size(msg));
 	tipc_link_send(buf, dnode, msg_link_selector(msg));
 }
 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index b634942caba..b4d87eb2dc5 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -50,7 +50,8 @@ void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str);
 static void node_lost_contact(struct tipc_node *n_ptr);
 static void node_established_contact(struct tipc_node *n_ptr);
 
-struct tipc_node *tipc_nodes = NULL;	/* sorted list of nodes within cluster */
+/* sorted list of nodes within cluster */
+static struct tipc_node *tipc_nodes = NULL;
 
 static DEFINE_SPINLOCK(node_create_lock);
 
@@ -125,16 +126,6 @@ void tipc_node_delete(struct tipc_node *n_ptr)
 	if (!n_ptr)
 		return;
 
-#if 0
-	/* Not needed because links are already deleted via tipc_bearer_stop() */
-
-	u32 l_num;
-
-	for (l_num = 0; l_num < MAX_BEARERS; l_num++) {
-		link_delete(n_ptr->links[l_num]);
-	}
-#endif
-
 	dbg("node %x deleted\n", n_ptr->addr);
 	kfree(n_ptr);
 }
@@ -237,23 +228,22 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr)
 
 int tipc_node_has_active_links(struct tipc_node *n_ptr)
 {
-	return (n_ptr &&
-		((n_ptr->active_links[0]) || (n_ptr->active_links[1])));
+	return n_ptr->active_links[0] != NULL;
 }
 
 int tipc_node_has_redundant_links(struct tipc_node *n_ptr)
 {
-	return (n_ptr->working_links > 1);
+	return n_ptr->working_links > 1;
 }
 
 static int tipc_node_has_active_routes(struct tipc_node *n_ptr)
 {
-	return (n_ptr && (n_ptr->last_router >= 0));
+	return n_ptr && (n_ptr->last_router >= 0);
 }
 
 int tipc_node_is_up(struct tipc_node *n_ptr)
 {
-	return (tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr));
+	return tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr);
 }
 
 struct tipc_node *tipc_node_attach_link(struct link *l_ptr)
@@ -384,6 +374,20 @@ static void node_established_contact(struct tipc_node *n_ptr)
 				  tipc_highest_allowed_slave);
 }
 
+static void node_cleanup_finished(unsigned long node_addr)
+{
+	struct tipc_node *n_ptr;
+
+	read_lock_bh(&tipc_net_lock);
+	n_ptr = tipc_node_find(node_addr);
+	if (n_ptr) {
+		tipc_node_lock(n_ptr);
+		n_ptr->cleanup_required = 0;
+		tipc_node_unlock(n_ptr);
+	}
+	read_unlock_bh(&tipc_net_lock);
+}
+
 static void node_lost_contact(struct tipc_node *n_ptr)
 {
 	struct cluster *c_ptr;
@@ -458,6 +462,11 @@ static void node_lost_contact(struct tipc_node *n_ptr)
 		tipc_k_signal((Handler)ns->handle_node_down,
 			      (unsigned long)ns->usr_handle);
 	}
+
+	/* Prevent re-contact with node until all cleanup is done */
+
+	n_ptr->cleanup_required = 1;
+	tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr);
 }
 
 /**
@@ -579,38 +588,6 @@ void tipc_node_remove_router(struct tipc_node *n_ptr, u32 router)
 		node_lost_contact(n_ptr);
 }
 
-#if 0
-void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str)
-{
-	u32 i;
-
-	tipc_printf(buf, "\n\n%s", str);
-	for (i = 0; i < MAX_BEARERS; i++) {
-		if (!n_ptr->links[i])
-			continue;
-		tipc_printf(buf, "Links[%u]: %x, ", i, n_ptr->links[i]);
-	}
-	tipc_printf(buf, "Active links: [%x,%x]\n",
-		    n_ptr->active_links[0], n_ptr->active_links[1]);
-}
-#endif
-
-u32 tipc_available_nodes(const u32 domain)
-{
-	struct tipc_node *n_ptr;
-	u32 cnt = 0;
-
-	read_lock_bh(&tipc_net_lock);
-	for (n_ptr = tipc_nodes; n_ptr; n_ptr = n_ptr->next) {
-		if (!tipc_in_scope(domain, n_ptr->addr))
-			continue;
-		if (tipc_node_is_up(n_ptr))
-			cnt++;
-	}
-	read_unlock_bh(&tipc_net_lock);
-	return cnt;
-}
-
 struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space)
 {
 	u32 domain;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 6f990da5d14..fff331b2d26 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -52,6 +52,7 @@
  * @active_links: pointers to active links to node
  * @links: pointers to all links to node
  * @working_links: number of working links to node (both active and standby)
+ * @cleanup_required: non-zero if cleaning up after a prior loss of contact
  * @link_cnt: number of links to node
  * @permit_changeover: non-zero if node has redundant links to this system
  * @routers: bitmap (used for multicluster communication)
@@ -78,6 +79,7 @@ struct tipc_node {
 	struct link *links[MAX_BEARERS];
 	int link_cnt;
 	int working_links;
+	int cleanup_required;
 	int permit_changeover;
 	u32 routers[512/32];
 	int last_router;
@@ -94,7 +96,6 @@ struct tipc_node {
 	} bclink;
 };
 
-extern struct tipc_node *tipc_nodes;
 extern u32 tipc_own_tag;
 
 struct tipc_node *tipc_node_create(u32 addr);
diff --git a/net/tipc/port.c b/net/tipc/port.c
index 0737680e926..82092eaa153 100644
--- a/net/tipc/port.c
+++ b/net/tipc/port.c
@@ -293,34 +293,6 @@ int tipc_deleteport(u32 ref)
 	return 0;
 }
 
-/**
- * tipc_get_port() - return port associated with 'ref'
- *
- * Note: Port is not locked.
- */
-
-struct tipc_port *tipc_get_port(const u32 ref)
-{
-	return (struct tipc_port *)tipc_ref_deref(ref);
-}
-
-/**
- * tipc_get_handle - return user handle associated to port 'ref'
- */
-
-void *tipc_get_handle(const u32 ref)
-{
-	struct port *p_ptr;
-	void * handle;
-
-	p_ptr = tipc_port_lock(ref);
-	if (!p_ptr)
-		return NULL;
-	handle = p_ptr->publ.usr_handle;
-	tipc_port_unlock(p_ptr);
-	return handle;
-}
-
 static int port_unreliable(struct port *p_ptr)
 {
 	return msg_src_droppable(&p_ptr->publ.phdr);
@@ -392,7 +364,7 @@ static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode,
 	struct sk_buff *buf;
 	struct tipc_msg *msg;
 
-	buf = buf_acquire(LONG_H_SIZE);
+	buf = tipc_buf_acquire(LONG_H_SIZE);
 	if (buf) {
 		msg = buf_msg(buf);
 		tipc_msg_init(msg, usr, type, LONG_H_SIZE, destnode);
@@ -433,7 +405,7 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err)
 		hdr_sz = MCAST_H_SIZE;
 	else
 		hdr_sz = LONG_H_SIZE;
-	rbuf = buf_acquire(data_sz + hdr_sz);
+	rbuf = tipc_buf_acquire(data_sz + hdr_sz);
 	if (rbuf == NULL) {
 		buf_discard(buf);
 		return data_sz;
@@ -588,19 +560,10 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf)
 	if (!p_ptr) {
 		err = TIPC_ERR_NO_PORT;
 	} else if (p_ptr->publ.connected) {
-		if (port_peernode(p_ptr) != msg_orignode(msg))
+		if ((port_peernode(p_ptr) != msg_orignode(msg)) ||
+		    (port_peerport(p_ptr) != msg_origport(msg))) {
 			err = TIPC_ERR_NO_PORT;
-		if (port_peerport(p_ptr) != msg_origport(msg))
-			err = TIPC_ERR_NO_PORT;
-		if (!err && msg_routed(msg)) {
-			u32 seqno = msg_transp_seqno(msg);
-			u32 myno =  ++p_ptr->last_in_seqno;
-			if (seqno != myno) {
-				err = TIPC_ERR_NO_PORT;
-				abort_buf = port_build_self_abort_msg(p_ptr, err);
-			}
-		}
-		if (msg_type(msg) == CONN_ACK) {
+		} else if (msg_type(msg) == CONN_ACK) {
 			int wakeup = tipc_port_congested(p_ptr) &&
 				     p_ptr->publ.congested &&
 				     p_ptr->wakeup;
@@ -719,50 +682,6 @@ struct sk_buff *tipc_port_get_ports(void)
 	return buf;
 }
 
-#if 0
-
-#define MAX_PORT_STATS 2000
-
-struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space)
-{
-	u32 ref;
-	struct port *p_ptr;
-	struct sk_buff *buf;
-	struct tlv_desc *rep_tlv;
-	struct print_buf pb;
-	int str_len;
-
-	if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_PORT_REF))
-		return cfg_reply_error_string(TIPC_CFG_TLV_ERROR);
-
-	ref = *(u32 *)TLV_DATA(req_tlv_area);
-	ref = ntohl(ref);
-
-	p_ptr = tipc_port_lock(ref);
-	if (!p_ptr)
-		return cfg_reply_error_string("port not found");
-
-	buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_STATS));
-	if (!buf) {
-		tipc_port_unlock(p_ptr);
-		return NULL;
-	}
-	rep_tlv = (struct tlv_desc *)buf->data;
-
-	tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_STATS);
-	port_print(p_ptr, &pb, 1);
-	/* NEED TO FILL IN ADDITIONAL PORT STATISTICS HERE */
-	tipc_port_unlock(p_ptr);
-	str_len = tipc_printbuf_validate(&pb);
-
-	skb_put(buf, TLV_SPACE(str_len));
-	TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len);
-
-	return buf;
-}
-
-#endif
-
 void tipc_port_reinit(void)
 {
 	struct port *p_ptr;
@@ -1295,50 +1214,13 @@ int tipc_shutdown(u32 ref)
 	return tipc_disconnect(ref);
 }
 
-int tipc_isconnected(u32 ref, int *isconnected)
-{
-	struct port *p_ptr;
-
-	p_ptr = tipc_port_lock(ref);
-	if (!p_ptr)
-		return -EINVAL;
-	*isconnected = p_ptr->publ.connected;
-	tipc_port_unlock(p_ptr);
-	return 0;
-}
-
-int tipc_peer(u32 ref, struct tipc_portid *peer)
-{
-	struct port *p_ptr;
-	int res;
-
-	p_ptr = tipc_port_lock(ref);
-	if (!p_ptr)
-		return -EINVAL;
-	if (p_ptr->publ.connected) {
-		peer->ref = port_peerport(p_ptr);
-		peer->node = port_peernode(p_ptr);
-		res = 0;
-	} else
-		res = -ENOTCONN;
-	tipc_port_unlock(p_ptr);
-	return res;
-}
-
-int tipc_ref_valid(u32 ref)
-{
-	/* Works irrespective of type */
-	return !!tipc_ref_deref(ref);
-}
-
-
 /*
  *  tipc_port_recv_sections(): Concatenate and deliver sectioned
  *                        message for this node.
  */
 
-int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
-		       struct iovec const *msg_sect)
+static int tipc_port_recv_sections(struct port *sender, unsigned int num_sect,
+				   struct iovec const *msg_sect)
 {
 	struct sk_buff *buf;
 	int res;
@@ -1389,65 +1271,16 @@ int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect)
 }
 
 /**
- * tipc_send_buf - send message buffer on connection
- */
-
-int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz)
-{
-	struct port *p_ptr;
-	struct tipc_msg *msg;
-	u32 destnode;
-	u32 hsz;
-	u32 sz;
-	u32 res;
-
-	p_ptr = tipc_port_deref(ref);
-	if (!p_ptr || !p_ptr->publ.connected)
-		return -EINVAL;
-
-	msg = &p_ptr->publ.phdr;
-	hsz = msg_hdr_sz(msg);
-	sz = hsz + dsz;
-	msg_set_size(msg, sz);
-	if (skb_cow(buf, hsz))
-		return -ENOMEM;
-
-	skb_push(buf, hsz);
-	skb_copy_to_linear_data(buf, msg, hsz);
-	destnode = msg_destnode(msg);
-	p_ptr->publ.congested = 1;
-	if (!tipc_port_congested(p_ptr)) {
-		if (likely(destnode != tipc_own_addr))
-			res = tipc_send_buf_fast(buf, destnode);
-		else {
-			tipc_port_recv_msg(buf);
-			res = sz;
-		}
-		if (likely(res != -ELINKCONG)) {
-			port_incr_out_seqno(p_ptr);
-			p_ptr->sent++;
-			p_ptr->publ.congested = 0;
-			return res;
-		}
-	}
-	if (port_unreliable(p_ptr)) {
-		p_ptr->publ.congested = 0;
-		return dsz;
-	}
-	return -ELINKCONG;
-}
-
-/**
  * tipc_forward2name - forward message sections to port name
  */
 
-int tipc_forward2name(u32 ref,
-		      struct tipc_name const *name,
-		      u32 domain,
-		      u32 num_sect,
-		      struct iovec const *msg_sect,
-		      struct tipc_portid const *orig,
-		      unsigned int importance)
+static int tipc_forward2name(u32 ref,
+			     struct tipc_name const *name,
+			     u32 domain,
+			     u32 num_sect,
+			     struct iovec const *msg_sect,
+			     struct tipc_portid const *orig,
+			     unsigned int importance)
 {
 	struct port *p_ptr;
 	struct tipc_msg *msg;
@@ -1473,7 +1306,7 @@ int tipc_forward2name(u32 ref,
 	msg_set_destnode(msg, destnode);
 	msg_set_destport(msg, destport);
 
-	if (likely(destport || destnode)) {
+	if (likely(destport)) {
 		p_ptr->sent++;
 		if (likely(destnode == tipc_own_addr))
 			return tipc_port_recv_sections(p_ptr, num_sect, msg_sect);
@@ -1510,89 +1343,15 @@ int tipc_send2name(u32 ref,
 }
 
 /**
- * tipc_forward_buf2name - forward message buffer to port name
- */
-
-int tipc_forward_buf2name(u32 ref,
-			  struct tipc_name const *name,
-			  u32 domain,
-			  struct sk_buff *buf,
-			  unsigned int dsz,
-			  struct tipc_portid const *orig,
-			  unsigned int importance)
-{
-	struct port *p_ptr;
-	struct tipc_msg *msg;
-	u32 destnode = domain;
-	u32 destport;
-	int res;
-
-	p_ptr = (struct port *)tipc_ref_deref(ref);
-	if (!p_ptr || p_ptr->publ.connected)
-		return -EINVAL;
-
-	msg = &p_ptr->publ.phdr;
-	if (importance <= TIPC_CRITICAL_IMPORTANCE)
-		msg_set_importance(msg, importance);
-	msg_set_type(msg, TIPC_NAMED_MSG);
-	msg_set_orignode(msg, orig->node);
-	msg_set_origport(msg, orig->ref);
-	msg_set_nametype(msg, name->type);
-	msg_set_nameinst(msg, name->instance);
-	msg_set_lookup_scope(msg, tipc_addr_scope(domain));
-	msg_set_hdr_sz(msg, LONG_H_SIZE);
-	msg_set_size(msg, LONG_H_SIZE + dsz);
-	destport = tipc_nametbl_translate(name->type, name->instance, &destnode);
-	msg_set_destnode(msg, destnode);
-	msg_set_destport(msg, destport);
-	msg_dbg(msg, "forw2name ==> ");
-	if (skb_cow(buf, LONG_H_SIZE))
-		return -ENOMEM;
-	skb_push(buf, LONG_H_SIZE);
-	skb_copy_to_linear_data(buf, msg, LONG_H_SIZE);
-	msg_dbg(buf_msg(buf),"PREP:");
-	if (likely(destport || destnode)) {
-		p_ptr->sent++;
-		if (destnode == tipc_own_addr)
-			return tipc_port_recv_msg(buf);
-		res = tipc_send_buf_fast(buf, destnode);
-		if (likely(res != -ELINKCONG))
-			return res;
-		if (port_unreliable(p_ptr))
-			return dsz;
-		return -ELINKCONG;
-	}
-	return tipc_reject_msg(buf, TIPC_ERR_NO_NAME);
-}
-
-/**
- * tipc_send_buf2name - send message buffer to port name
- */
-
-int tipc_send_buf2name(u32 ref,
-		       struct tipc_name const *dest,
-		       u32 domain,
-		       struct sk_buff *buf,
-		       unsigned int dsz)
-{
-	struct tipc_portid orig;
-
-	orig.ref = ref;
-	orig.node = tipc_own_addr;
-	return tipc_forward_buf2name(ref, dest, domain, buf, dsz, &orig,
-				     TIPC_PORT_IMPORTANCE);
-}
-
-/**
  * tipc_forward2port - forward message sections to port identity
  */
 
-int tipc_forward2port(u32 ref,
-		      struct tipc_portid const *dest,
-		      unsigned int num_sect,
-		      struct iovec const *msg_sect,
-		      struct tipc_portid const *orig,
-		      unsigned int importance)
+static int tipc_forward2port(u32 ref,
+			     struct tipc_portid const *dest,
+			     unsigned int num_sect,
+			     struct iovec const *msg_sect,
+			     struct tipc_portid const *orig,
+			     unsigned int importance)
 {
 	struct port *p_ptr;
 	struct tipc_msg *msg;
@@ -1644,12 +1403,12 @@ int tipc_send2port(u32 ref,
 /**
  * tipc_forward_buf2port - forward message buffer to port identity
  */
-int tipc_forward_buf2port(u32 ref,
-			  struct tipc_portid const *dest,
-			  struct sk_buff *buf,
-			  unsigned int dsz,
-			  struct tipc_portid const *orig,
-			  unsigned int importance)
+static int tipc_forward_buf2port(u32 ref,
+				 struct tipc_portid const *dest,
+				 struct sk_buff *buf,
+				 unsigned int dsz,
+				 struct tipc_portid const *orig,
+				 unsigned int importance)
 {
 	struct port *p_ptr;
 	struct tipc_msg *msg;
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 8d1652aab29..73bbf442b34 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -109,8 +109,6 @@ struct port {
 extern spinlock_t tipc_port_list_lock;
 struct port_list;
 
-int tipc_port_recv_sections(struct port *p_ptr, u32 num_sect,
-			    struct iovec const *msg_sect);
 int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr,
 			      struct iovec const *msg_sect, u32 num_sect,
 			      int err);
@@ -157,7 +155,7 @@ static inline u32 tipc_peer_node(struct port *p_ptr)
 
 static inline int tipc_port_congested(struct port *p_ptr)
 {
-	return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2));
+	return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2);
 }
 
 /**
diff --git a/net/tipc/ref.c b/net/tipc/ref.c
index 8dea66500cf..ab8ad32d8c2 100644
--- a/net/tipc/ref.c
+++ b/net/tipc/ref.c
@@ -282,23 +282,6 @@ void *tipc_ref_lock(u32 ref)
 	return NULL;
 }
 
-/**
- * tipc_ref_unlock - unlock referenced object
- */
-
-void tipc_ref_unlock(u32 ref)
-{
-	if (likely(tipc_ref_table.entries)) {
-		struct reference *entry;
-
-		entry = &tipc_ref_table.entries[ref &
-						tipc_ref_table.index_mask];
-		if (likely((entry->ref == ref) && (entry->object)))
-			spin_unlock_bh(&entry->lock);
-		else
-			err("Attempt to unlock non-existent reference\n");
-	}
-}
 
 /**
  * tipc_ref_deref - return pointer referenced object (without locking it)
diff --git a/net/tipc/ref.h b/net/tipc/ref.h
index 7e3798ea93b..5bc8e7ab84d 100644
--- a/net/tipc/ref.h
+++ b/net/tipc/ref.h
@@ -44,7 +44,6 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock);
 void tipc_ref_discard(u32 ref);
 
 void *tipc_ref_lock(u32 ref);
-void tipc_ref_unlock(u32 ref);
 void *tipc_ref_deref(u32 ref);
 
 #endif
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 66e889ba48f..33217fc3d69 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -64,6 +64,7 @@ struct tipc_sock {
 	struct sock sk;
 	struct tipc_port *p;
 	struct tipc_portid peer_name;
+	long conn_timeout;
 };
 
 #define tipc_sk(sk) ((struct tipc_sock *)(sk))
@@ -240,9 +241,9 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol,
 	sock->state = state;
 
 	sock_init_data(sock, sk);
-	sk->sk_rcvtimeo = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
 	sk->sk_backlog_rcv = backlog_rcv;
 	tipc_sk(sk)->p = tp_ptr;
+	tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
 
 	spin_unlock_bh(tp_ptr->lock);
 
@@ -429,36 +430,55 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr,
  * to handle any preventable race conditions, so TIPC will do the same ...
  *
  * TIPC sets the returned events as follows:
- * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty
- *    or if a connection-oriented socket is does not have an active connection
- *    (i.e. a read operation will not block).
- * b) POLLOUT is set except when a socket's connection has been terminated
- *    (i.e. a write operation will not block).
- * c) POLLHUP is set when a socket's connection has been terminated.
- *
- * IMPORTANT: The fact that a read or write operation will not block does NOT
- * imply that the operation will succeed!
+ *
+ * socket state		flags set
+ * ------------		---------
+ * unconnected		no read flags
+ *			no write flags
+ *
+ * connecting		POLLIN/POLLRDNORM if ACK/NACK in rx queue
+ *			no write flags
+ *
+ * connected		POLLIN/POLLRDNORM if data in rx queue
+ *			POLLOUT if port is not congested
+ *
+ * disconnecting	POLLIN/POLLRDNORM/POLLHUP
+ *			no write flags
+ *
+ * listening		POLLIN if SYN in rx queue
+ *			no write flags
+ *
+ * ready		POLLIN/POLLRDNORM if data in rx queue
+ * [connectionless]	POLLOUT (since port cannot be congested)
+ *
+ * IMPORTANT: The fact that a read or write operation is indicated does NOT
+ * imply that the operation will succeed, merely that it should be performed
+ * and will not block.
  */
 
 static unsigned int poll(struct file *file, struct socket *sock,
 			 poll_table *wait)
 {
 	struct sock *sk = sock->sk;
-	u32 mask;
+	u32 mask = 0;
 
 	poll_wait(file, sk_sleep(sk), wait);
 
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sock->state == SS_UNCONNECTED) ||
-	    (sock->state == SS_DISCONNECTING))
-		mask = (POLLRDNORM | POLLIN);
-	else
-		mask = 0;
-
-	if (sock->state == SS_DISCONNECTING)
-		mask |= POLLHUP;
-	else
-		mask |= POLLOUT;
+	switch ((int)sock->state) {
+	case SS_READY:
+	case SS_CONNECTED:
+		if (!tipc_sk_port(sk)->congested)
+			mask |= POLLOUT;
+		/* fall thru' */
+	case SS_CONNECTING:
+	case SS_LISTENING:
+		if (!skb_queue_empty(&sk->sk_receive_queue))
+			mask |= (POLLIN | POLLRDNORM);
+		break;
+	case SS_DISCONNECTING:
+		mask = (POLLIN | POLLRDNORM | POLLHUP);
+		break;
+	}
 
 	return mask;
 }
@@ -1026,9 +1046,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
 	struct sk_buff *buf;
 	struct tipc_msg *msg;
 	unsigned int sz;
-	int sz_to_copy;
+	int sz_to_copy, target, needed;
 	int sz_copied = 0;
-	int needed;
 	char __user *crs = m->msg_iov->iov_base;
 	unsigned char *buf_crs;
 	u32 err;
@@ -1050,6 +1069,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock,
 		goto exit;
 	}
 
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len);
+
 restart:
 
 	/* Look for a message in receive queue; wait if necessary */
@@ -1138,7 +1159,7 @@ restart:
 
 	if ((sz_copied < buf_len) &&	/* didn't get all requested data */
 	    (!skb_queue_empty(&sk->sk_receive_queue) ||
-	     (flags & MSG_WAITALL)) &&	/* and more is ready or required */
+	    (sz_copied < target)) &&	/* and more is ready or required */
 	    (!(flags & MSG_PEEK)) &&	/* and aren't just peeking at data */
 	    (!err))			/* and haven't reached a FIN */
 		goto restart;
@@ -1174,7 +1195,7 @@ static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
 	if (msg_connected(msg))
 		threshold *= 4;
 
-	return (queue_size >= threshold);
+	return queue_size >= threshold;
 }
 
 /**
@@ -1365,6 +1386,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
 	struct msghdr m = {NULL,};
 	struct sk_buff *buf;
 	struct tipc_msg *msg;
+	long timeout;
 	int res;
 
 	lock_sock(sk);
@@ -1379,7 +1401,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
 	/* For now, TIPC does not support the non-blocking form of connect() */
 
 	if (flags & O_NONBLOCK) {
-		res = -EWOULDBLOCK;
+		res = -EOPNOTSUPP;
 		goto exit;
 	}
 
@@ -1425,11 +1447,12 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen,
 
 	/* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */
 
+	timeout = tipc_sk(sk)->conn_timeout;
 	release_sock(sk);
 	res = wait_event_interruptible_timeout(*sk_sleep(sk),
 			(!skb_queue_empty(&sk->sk_receive_queue) ||
 			(sock->state != SS_CONNECTING)),
-			sk->sk_rcvtimeo);
+			timeout ? timeout : MAX_SCHEDULE_TIMEOUT);
 	lock_sock(sk);
 
 	if (res > 0) {
@@ -1692,7 +1715,7 @@ static int setsockopt(struct socket *sock,
 		res = tipc_set_portunreturnable(tport->ref, value);
 		break;
 	case TIPC_CONN_TIMEOUT:
-		sk->sk_rcvtimeo = msecs_to_jiffies(value);
+		tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value);
 		/* no need to set "res", since already 0 at this point */
 		break;
 	default:
@@ -1747,7 +1770,7 @@ static int getsockopt(struct socket *sock,
 		res = tipc_portunreturnable(tport->ref, &value);
 		break;
 	case TIPC_CONN_TIMEOUT:
-		value = jiffies_to_msecs(sk->sk_rcvtimeo);
+		value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout);
 		/* no need to set "res", since already 0 at this point */
 		break;
 	 case TIPC_NODE_RECVQ_DEPTH:
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index ab6eab4c45e..33313961d01 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -76,6 +76,19 @@ struct top_srv {
 static struct top_srv topsrv = { 0 };
 
 /**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ *
+ * Returns converted value
+ */
+
+static u32 htohl(u32 in, int swap)
+{
+	return swap ? swab32(in) : in;
+}
+
+/**
  * subscr_send_event - send a message containing a tipc_event to the subscriber
  *
  * Note: Must not hold subscriber's server port lock, since tipc_send() will
@@ -94,11 +107,11 @@ static void subscr_send_event(struct subscription *sub,
 	msg_sect.iov_base = (void *)&sub->evt;
 	msg_sect.iov_len = sizeof(struct tipc_event);
 
-	sub->evt.event = htonl(event);
-	sub->evt.found_lower = htonl(found_lower);
-	sub->evt.found_upper = htonl(found_upper);
-	sub->evt.port.ref = htonl(port_ref);
-	sub->evt.port.node = htonl(node);
+	sub->evt.event = htohl(event, sub->swap);
+	sub->evt.found_lower = htohl(found_lower, sub->swap);
+	sub->evt.found_upper = htohl(found_upper, sub->swap);
+	sub->evt.port.ref = htohl(port_ref, sub->swap);
+	sub->evt.port.node = htohl(node, sub->swap);
 	tipc_send(sub->server_ref, 1, &msg_sect);
 }
 
@@ -274,29 +287,16 @@ static void subscr_cancel(struct tipc_subscr *s,
 {
 	struct subscription *sub;
 	struct subscription *sub_temp;
-	__u32 type, lower, upper, timeout, filter;
 	int found = 0;
 
 	/* Find first matching subscription, exit if not found */
 
-	type = ntohl(s->seq.type);
-	lower = ntohl(s->seq.lower);
-	upper = ntohl(s->seq.upper);
-	timeout = ntohl(s->timeout);
-	filter = ntohl(s->filter) & ~TIPC_SUB_CANCEL;
-
 	list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list,
 				 subscription_list) {
-			if ((type == sub->seq.type) &&
-			    (lower == sub->seq.lower) &&
-			    (upper == sub->seq.upper) &&
-			    (timeout == sub->timeout) &&
-                            (filter == sub->filter) &&
-                             !memcmp(s->usr_handle,sub->evt.s.usr_handle,
-				     sizeof(s->usr_handle)) ){
-				found = 1;
-				break;
-			}
+		if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
+			found = 1;
+			break;
+		}
 	}
 	if (!found)
 		return;
@@ -310,7 +310,7 @@ static void subscr_cancel(struct tipc_subscr *s,
 		k_term_timer(&sub->timer);
 		spin_lock_bh(subscriber->lock);
 	}
-	dbg("Cancel: removing sub %u,%u,%u from subscriber %p list\n",
+	dbg("Cancel: removing sub %u,%u,%u from subscriber %x list\n",
 	    sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber);
 	subscr_del(sub);
 }
@@ -325,10 +325,16 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
 					     struct subscriber *subscriber)
 {
 	struct subscription *sub;
+	int swap;
+
+	/* Determine subscriber's endianness */
+
+	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
 
 	/* Detect & process a subscription cancellation request */
 
-	if (ntohl(s->filter) & TIPC_SUB_CANCEL) {
+	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
 		subscr_cancel(s, subscriber);
 		return NULL;
 	}
@@ -353,12 +359,13 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
 
 	/* Initialize subscription object */
 
-	sub->seq.type = ntohl(s->seq.type);
-	sub->seq.lower = ntohl(s->seq.lower);
-	sub->seq.upper = ntohl(s->seq.upper);
-	sub->timeout = ntohl(s->timeout);
-	sub->filter = ntohl(s->filter);
-	if ((sub->filter && (sub->filter != TIPC_SUB_PORTS)) ||
+	sub->seq.type = htohl(s->seq.type, swap);
+	sub->seq.lower = htohl(s->seq.lower, swap);
+	sub->seq.upper = htohl(s->seq.upper, swap);
+	sub->timeout = htohl(s->timeout, swap);
+	sub->filter = htohl(s->filter, swap);
+	if ((!(sub->filter & TIPC_SUB_PORTS) ==
+	     !(sub->filter & TIPC_SUB_SERVICE)) ||
 	    (sub->seq.lower > sub->seq.upper)) {
 		warn("Subscription rejected, illegal request\n");
 		kfree(sub);
@@ -369,6 +376,7 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s,
 	INIT_LIST_HEAD(&sub->nameseq_list);
 	list_add(&sub->subscription_list, &subscriber->subscription_list);
 	sub->server_ref = subscriber->port_ref;
+	sub->swap = swap;
 	memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr));
 	atomic_inc(&topsrv.subscription_count);
 	if (sub->timeout != TIPC_WAIT_FOREVER) {
@@ -598,12 +606,3 @@ void tipc_subscr_stop(void)
 		topsrv.user_ref = 0;
 	}
 }
-
-
-int tipc_ispublished(struct tipc_name const *name)
-{
-	u32 domain = 0;
-
-	return(tipc_nametbl_translate(name->type, name->instance,&domain) != 0);
-}
-
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index c20f496d95b..45d89bf4d20 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -53,6 +53,7 @@ typedef void (*tipc_subscr_event) (struct subscription *sub,
  * @nameseq_list: adjacent subscriptions in name sequence's subscription list
  * @subscription_list: adjacent subscriptions in subscriber's subscription list
  * @server_ref: object reference of server port associated with subscription
+ * @swap: indicates if subscriber uses opposite endianness in its messages
  * @evt: template for events generated by subscription
  */
 
@@ -65,6 +66,7 @@ struct subscription {
 	struct list_head nameseq_list;
 	struct list_head subscription_list;
 	u32 server_ref;
+	int swap;
 	struct tipc_event evt;
 };
 
diff --git a/net/tipc/zone.c b/net/tipc/zone.c
index 2c01ba2d86b..83f8b5e91fc 100644
--- a/net/tipc/zone.c
+++ b/net/tipc/zone.c
@@ -160,14 +160,3 @@ u32 tipc_zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref)
 	}
 	return 0;
 }
-
-
-u32 tipc_zone_next_node(u32 addr)
-{
-	struct cluster *c_ptr = tipc_cltr_find(addr);
-
-	if (c_ptr)
-		return tipc_cltr_next_node(c_ptr, addr);
-	return 0;
-}
-
diff --git a/net/tipc/zone.h b/net/tipc/zone.h
index 7bdc3406ba9..bd1c20ce9d0 100644
--- a/net/tipc/zone.h
+++ b/net/tipc/zone.h
@@ -61,7 +61,6 @@ void tipc_zone_send_external_routes(struct _zone *z_ptr, u32 dest);
 struct _zone *tipc_zone_create(u32 addr);
 void tipc_zone_delete(struct _zone *z_ptr);
 void tipc_zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr);
-u32 tipc_zone_next_node(u32 addr);
 
 static inline struct _zone *tipc_zone_find(u32 addr)
 {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fef2cc5e9d2..3c95304a081 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,7 +117,7 @@
 
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
-static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static atomic_long_t unix_nr_socks;
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
@@ -282,7 +282,7 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
 	return s;
 }
 
-static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
+static struct sock *unix_find_socket_byinode(struct inode *i)
 {
 	struct sock *s;
 	struct hlist_node *node;
@@ -292,9 +292,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->dentry;
 
-		if (!net_eq(sock_net(s), net))
-			continue;
-
 		if (dentry && dentry->d_inode == i) {
 			sock_hold(s);
 			goto found;
@@ -363,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
 	if (u->addr)
 		unix_release_addr(u->addr);
 
-	atomic_dec(&unix_nr_socks);
+	atomic_long_dec(&unix_nr_socks);
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 #ifdef UNIX_REFCNT_DEBUG
-	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk,
-		atomic_read(&unix_nr_socks));
+	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
+		atomic_long_read(&unix_nr_socks));
 #endif
 }
 
@@ -450,11 +447,31 @@ static int unix_release_sock(struct sock *sk, int embrion)
 	return 0;
 }
 
+static void init_peercred(struct sock *sk)
+{
+	put_pid(sk->sk_peer_pid);
+	if (sk->sk_peer_cred)
+		put_cred(sk->sk_peer_cred);
+	sk->sk_peer_pid  = get_pid(task_tgid(current));
+	sk->sk_peer_cred = get_current_cred();
+}
+
+static void copy_peercred(struct sock *sk, struct sock *peersk)
+{
+	put_pid(sk->sk_peer_pid);
+	if (sk->sk_peer_cred)
+		put_cred(sk->sk_peer_cred);
+	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
+	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
+}
+
 static int unix_listen(struct socket *sock, int backlog)
 {
 	int err;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct pid *old_pid = NULL;
+	const struct cred *old_cred = NULL;
 
 	err = -EOPNOTSUPP;
 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -470,12 +487,14 @@ static int unix_listen(struct socket *sock, int backlog)
 	sk->sk_max_ack_backlog	= backlog;
 	sk->sk_state		= TCP_LISTEN;
 	/* set credentials so connect can copy them */
-	sk->sk_peercred.pid	= task_tgid_vnr(current);
-	current_euid_egid(&sk->sk_peercred.uid, &sk->sk_peercred.gid);
+	init_peercred(sk);
 	err = 0;
 
 out_unlock:
 	unix_state_unlock(sk);
+	put_pid(old_pid);
+	if (old_cred)
+		put_cred(old_cred);
 out:
 	return err;
 }
@@ -587,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	atomic_inc(&unix_nr_socks);
-	if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
+	atomic_long_inc(&unix_nr_socks);
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@@ -613,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
-		atomic_dec(&unix_nr_socks);
+		atomic_long_dec(&unix_nr_socks);
 	else {
 		local_bh_disable();
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -673,6 +692,7 @@ static int unix_autobind(struct socket *sock)
 	static u32 ordernum = 1;
 	struct unix_address *addr;
 	int err;
+	unsigned int retries = 0;
 
 	mutex_lock(&u->readlock);
 
@@ -698,9 +718,17 @@ retry:
 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 				      addr->hash)) {
 		spin_unlock(&unix_table_lock);
-		/* Sanity yield. It is unusual case, but yet... */
-		if (!(ordernum&0xFF))
-			yield();
+		/*
+		 * __unix_find_socket_byname() may take long time if many names
+		 * are already in use.
+		 */
+		cond_resched();
+		/* Give up if all names seems to be in use. */
+		if (retries++ == 0xFFFFF) {
+			err = -ENOSPC;
+			kfree(addr);
+			goto out;
+		}
 		goto retry;
 	}
 	addr->hash ^= sk->sk_type;
@@ -736,7 +764,7 @@ static struct sock *unix_find_other(struct net *net,
 		err = -ECONNREFUSED;
 		if (!S_ISSOCK(inode->i_mode))
 			goto put_fail;
-		u = unix_find_socket_byinode(net, inode);
+		u = unix_find_socket_byinode(inode);
 		if (!u)
 			goto put_fail;
 
@@ -1140,8 +1168,7 @@ restart:
 	unix_peer(newsk)	= sk;
 	newsk->sk_state		= TCP_ESTABLISHED;
 	newsk->sk_type		= sk->sk_type;
-	newsk->sk_peercred.pid	= task_tgid_vnr(current);
-	current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid);
+	init_peercred(newsk);
 	newu = unix_sk(newsk);
 	newsk->sk_wq		= &newu->peer_wq;
 	otheru = unix_sk(other);
@@ -1157,7 +1184,7 @@ restart:
 	}
 
 	/* Set credentials */
-	sk->sk_peercred = other->sk_peercred;
+	copy_peercred(sk, other);
 
 	sock->state	= SS_CONNECTED;
 	sk->sk_state	= TCP_ESTABLISHED;
@@ -1199,10 +1226,8 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
 	sock_hold(skb);
 	unix_peer(ska) = skb;
 	unix_peer(skb) = ska;
-	ska->sk_peercred.pid = skb->sk_peercred.pid = task_tgid_vnr(current);
-	current_euid_egid(&skb->sk_peercred.uid, &skb->sk_peercred.gid);
-	ska->sk_peercred.uid = skb->sk_peercred.uid;
-	ska->sk_peercred.gid = skb->sk_peercred.gid;
+	init_peercred(ska);
+	init_peercred(skb);
 
 	if (ska->sk_type != SOCK_DGRAM) {
 		ska->sk_state = TCP_ESTABLISHED;
@@ -1297,18 +1322,20 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	int i;
 
 	scm->fp = UNIXCB(skb).fp;
-	skb->destructor = sock_wfree;
 	UNIXCB(skb).fp = NULL;
 
 	for (i = scm->fp->count-1; i >= 0; i--)
 		unix_notinflight(scm->fp->fp[i]);
 }
 
-static void unix_destruct_fds(struct sk_buff *skb)
+static void unix_destruct_scm(struct sk_buff *skb)
 {
 	struct scm_cookie scm;
 	memset(&scm, 0, sizeof(scm));
-	unix_detach_fds(&scm, skb);
+	scm.pid  = UNIXCB(skb).pid;
+	scm.cred = UNIXCB(skb).cred;
+	if (UNIXCB(skb).fp)
+		unix_detach_fds(&scm, skb);
 
 	/* Alas, it calls VFS */
 	/* So fscking what? fput() had been SMP-safe since the last Summer */
@@ -1331,10 +1358,22 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 
 	for (i = scm->fp->count-1; i >= 0; i--)
 		unix_inflight(scm->fp->fp[i]);
-	skb->destructor = unix_destruct_fds;
 	return 0;
 }
 
+static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+{
+	int err = 0;
+	UNIXCB(skb).pid  = get_pid(scm->pid);
+	UNIXCB(skb).cred = get_cred(scm->cred);
+	UNIXCB(skb).fp = NULL;
+	if (scm->fp && send_fds)
+		err = unix_attach_fds(scm, skb);
+
+	skb->destructor = unix_destruct_scm;
+	return err;
+}
+
 /*
  *	Send AF_UNIX data.
  */
@@ -1391,12 +1430,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
-	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-	if (siocb->scm->fp) {
-		err = unix_attach_fds(siocb->scm, skb);
-		if (err)
-			goto out_free;
-	}
+	err = unix_scm_to_skb(siocb->scm, skb, true);
+	if (err)
+		goto out_free;
 	unix_get_secdata(siocb->scm, skb);
 
 	skb_reset_transport_header(skb);
@@ -1475,6 +1511,8 @@ restart:
 		goto restart;
 	}
 
+	if (sock_flag(other, SOCK_RCVTSTAMP))
+		__net_timestamp(skb);
 	skb_queue_tail(&other->sk_receive_queue, skb);
 	unix_state_unlock(other);
 	other->sk_data_ready(other, len);
@@ -1566,16 +1604,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 */
 		size = min_t(int, size, skb_tailroom(skb));
 
-		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
+
 		/* Only send the fds in the first buffer */
-		if (siocb->scm->fp && !fds_sent) {
-			err = unix_attach_fds(siocb->scm, skb);
-			if (err) {
-				kfree_skb(skb);
-				goto out_err;
-			}
-			fds_sent = true;
+		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
+		if (err) {
+			kfree_skb(skb);
+			goto out_err;
 		}
+		fds_sent = true;
 
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 		if (err) {
@@ -1688,11 +1724,14 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (err)
 		goto out_free;
 
+	if (sock_flag(sk, SOCK_RCVTSTAMP))
+		__sock_recv_timestamp(msg, sk, skb);
+
 	if (!siocb->scm) {
 		siocb->scm = &tmp_scm;
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
-	siocb->scm->creds = *UNIXCREDS(skb);
+	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
 	unix_set_secdata(siocb->scm, skb);
 
 	if (!(flags & MSG_PEEK)) {
@@ -1841,14 +1880,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		if (check_creds) {
 			/* Never glue messages from different writers */
-			if (memcmp(UNIXCREDS(skb), &siocb->scm->creds,
-				   sizeof(siocb->scm->creds)) != 0) {
+			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
+			    (UNIXCB(skb).cred != siocb->scm->cred)) {
 				skb_queue_head(&sk->sk_receive_queue, skb);
 				break;
 			}
 		} else {
 			/* Copy credentials */
-			siocb->scm->creds = *UNIXCREDS(skb);
+			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
 			check_creds = 1;
 		}
 
@@ -1881,7 +1920,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 				break;
 			}
 
-			kfree_skb(skb);
+			consume_skb(skb);
 
 			if (siocb->scm->fp)
 				break;
@@ -1999,11 +2038,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 258daa80ad9..74944a2dd43 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -48,7 +48,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>	/* support for loadable modules */
 #include <linux/slab.h>		/* kmalloc(), kfree() */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 #include <linux/mm.h>
 #include <linux/string.h>	/* inline mem*, str* functions */
 
@@ -71,6 +71,7 @@
  *	WAN device IOCTL handlers
  */
 
+static DEFINE_MUTEX(wanrouter_mutex);
 static int wanrouter_device_setup(struct wan_device *wandev,
 				  wandev_conf_t __user *u_conf);
 static int wanrouter_device_stat(struct wan_device *wandev,
@@ -376,7 +377,7 @@ long wanrouter_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	if (wandev->magic != ROUTER_MAGIC)
 		return -EINVAL;
 
-	lock_kernel();
+	mutex_lock(&wanrouter_mutex);
 	switch (cmd) {
 	case ROUTER_SETUP:
 		err = wanrouter_device_setup(wandev, data);
@@ -408,7 +409,7 @@ long wanrouter_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			err = wandev->ioctl(wandev, cmd, arg);
 		else err = -EINVAL;
 	}
-	unlock_kernel();
+	mutex_unlock(&wanrouter_mutex);
 	return err;
 }
 
@@ -470,7 +471,7 @@ static int wanrouter_device_setup(struct wan_device *wandev,
 		data = vmalloc(conf->data_size);
 		if (!data) {
 			printk(KERN_INFO
-				"%s: ERROR, Faild allocate kernel memory !\n",
+				"%s: ERROR, Failed allocate kernel memory !\n",
 				wandev->name);
 			kfree(conf);
 			return -ENOBUFS;
@@ -480,7 +481,7 @@ static int wanrouter_device_setup(struct wan_device *wandev,
 			err = wandev->setup(wandev, conf);
 		} else {
 			printk(KERN_INFO
-			     "%s: ERROR, Faild to copy from user data !\n",
+			     "%s: ERROR, Failed to copy from user data !\n",
 			       wandev->name);
 			err = -EFAULT;
 		}
diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c
index c44d96b3a43..11f25c7a7a0 100644
--- a/net/wanrouter/wanproc.c
+++ b/net/wanrouter/wanproc.c
@@ -27,7 +27,7 @@
 #include <linux/module.h>
 #include <linux/wanrouter.h>	/* WAN router API definitions */
 #include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
 
 #include <net/net_namespace.h>
 #include <asm/io.h>
@@ -66,6 +66,7 @@
  *	/proc/net/router
  */
 
+static DEFINE_MUTEX(config_mutex);
 static struct proc_dir_entry *proc_router;
 
 /* Strings */
@@ -85,7 +86,7 @@ static void *r_start(struct seq_file *m, loff_t *pos)
 	struct wan_device *wandev;
 	loff_t l = *pos;
 
-	lock_kernel();
+	mutex_lock(&config_mutex);
 	if (!l--)
 		return SEQ_START_TOKEN;
 	for (wandev = wanrouter_router_devlist; l-- && wandev;
@@ -104,7 +105,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 static void r_stop(struct seq_file *m, void *v)
 	__releases(kernel_lock)
 {
-	unlock_kernel();
+	mutex_unlock(&config_mutex);
 }
 
 static int config_show(struct seq_file *m, void *v)
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index b01a6f6397d..d0c92dddb26 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -35,8 +35,9 @@ rdev_freq_to_chan(struct cfg80211_registered_device *rdev,
 		if (!ht_cap->ht_supported)
 			return NULL;
 
-		if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ||
-		    ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT)
+		if (channel_type != NL80211_CHAN_HT20 &&
+		    (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) ||
+		    ht_cap->cap & IEEE80211_HT_CAP_40MHZ_INTOLERANT))
 			return NULL;
 	}
 
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 37d0e0ab443..9c21ebf9780 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -253,11 +253,16 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 			WARN_ON(err);
 			wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
 		}
+
+		return err;
 	}
 
 	wiphy_net_set(&rdev->wiphy, net);
 
-	return err;
+	err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev));
+	WARN_ON(err);
+
+	return 0;
 }
 
 static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
@@ -428,7 +433,7 @@ int wiphy_register(struct wiphy *wiphy)
 
 	/* sanity check ifmodes */
 	WARN_ON(!ifmodes);
-	ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1;
+	ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1;
 	if (WARN_ON(ifmodes != wiphy->interface_modes))
 		wiphy->interface_modes = ifmodes;
 
@@ -472,15 +477,13 @@ int wiphy_register(struct wiphy *wiphy)
 	/* check and set up bitrates */
 	ieee80211_set_bitrate_flags(wiphy);
 
+	mutex_lock(&cfg80211_mutex);
+
 	res = device_add(&rdev->wiphy.dev);
-	if (res)
+	if (res) {
+		mutex_unlock(&cfg80211_mutex);
 		return res;
-
-	res = rfkill_register(rdev->rfkill);
-	if (res)
-		goto out_rm_dev;
-
-	mutex_lock(&cfg80211_mutex);
+	}
 
 	/* set up regulatory info */
 	wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
@@ -488,8 +491,6 @@ int wiphy_register(struct wiphy *wiphy)
 	list_add_rcu(&rdev->list, &cfg80211_rdev_list);
 	cfg80211_rdev_list_generation++;
 
-	mutex_unlock(&cfg80211_mutex);
-
 	/* add to debugfs */
 	rdev->wiphy.debugfsdir =
 		debugfs_create_dir(wiphy_name(&rdev->wiphy),
@@ -509,10 +510,19 @@ int wiphy_register(struct wiphy *wiphy)
 	}
 
 	cfg80211_debugfs_rdev_add(rdev);
+	mutex_unlock(&cfg80211_mutex);
+
+	/*
+	 * due to a locking dependency this has to be outside of the
+	 * cfg80211_mutex lock
+	 */
+	res = rfkill_register(rdev->rfkill);
+	if (res)
+		goto out_rm_dev;
 
 	return 0;
 
- out_rm_dev:
+out_rm_dev:
 	device_del(&rdev->wiphy.dev);
 	return res;
 }
@@ -678,8 +688,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
 		INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work);
 		INIT_LIST_HEAD(&wdev->event_list);
 		spin_lock_init(&wdev->event_lock);
-		INIT_LIST_HEAD(&wdev->action_registrations);
-		spin_lock_init(&wdev->action_registrations_lock);
+		INIT_LIST_HEAD(&wdev->mgmt_registrations);
+		spin_lock_init(&wdev->mgmt_registrations_lock);
 
 		mutex_lock(&rdev->devlist_mtx);
 		list_add_rcu(&wdev->list, &rdev->netdev_list);
@@ -719,6 +729,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
 			dev->ethtool_ops = &cfg80211_ethtool_ops;
 
 		if ((wdev->iftype == NL80211_IFTYPE_STATION ||
+		     wdev->iftype == NL80211_IFTYPE_P2P_CLIENT ||
 		     wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr)
 			dev->priv_flags |= IFF_DONT_BRIDGE;
 		break;
@@ -727,6 +738,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
 		case NL80211_IFTYPE_ADHOC:
 			cfg80211_leave_ibss(rdev, dev, true);
 			break;
+		case NL80211_IFTYPE_P2P_CLIENT:
 		case NL80211_IFTYPE_STATION:
 			wdev_lock(wdev);
 #ifdef CONFIG_CFG80211_WEXT
@@ -799,7 +811,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb,
 			sysfs_remove_link(&dev->dev.kobj, "phy80211");
 			list_del_rcu(&wdev->list);
 			rdev->devlist_generation++;
-			cfg80211_mlme_purge_actions(wdev);
+			cfg80211_mlme_purge_registrations(wdev);
 #ifdef CONFIG_CFG80211_WEXT
 			kfree(wdev->wext.keys);
 #endif
@@ -894,7 +906,7 @@ out_fail_pernet:
 }
 subsys_initcall(cfg80211_init);
 
-static void cfg80211_exit(void)
+static void __exit cfg80211_exit(void)
 {
 	debugfs_remove(ieee80211_debugfs_dir);
 	nl80211_exit();
diff --git a/net/wireless/core.h b/net/wireless/core.h
index ae930acf75e..6583cca0e2e 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -86,7 +86,7 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
 static inline
 bool wiphy_idx_valid(int wiphy_idx)
 {
-	return (wiphy_idx >= 0);
+	return wiphy_idx >= 0;
 }
 
 
@@ -95,7 +95,10 @@ extern struct mutex cfg80211_mutex;
 extern struct list_head cfg80211_rdev_list;
 extern int cfg80211_rdev_list_generation;
 
-#define assert_cfg80211_lock() WARN_ON(!mutex_is_locked(&cfg80211_mutex))
+static inline void assert_cfg80211_lock(void)
+{
+	lockdep_assert_held(&cfg80211_mutex);
+}
 
 /*
  * You can use this to mark a wiphy_idx as not having an associated wiphy.
@@ -202,8 +205,8 @@ static inline void wdev_unlock(struct wireless_dev *wdev)
 	mutex_unlock(&wdev->mtx);
 }
 
-#define ASSERT_RDEV_LOCK(rdev) WARN_ON(!mutex_is_locked(&(rdev)->mtx));
-#define ASSERT_WDEV_LOCK(wdev) WARN_ON(!mutex_is_locked(&(wdev)->mtx));
+#define ASSERT_RDEV_LOCK(rdev) lockdep_assert_held(&(rdev)->mtx)
+#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx)
 
 enum cfg80211_event_type {
 	EVENT_CONNECT_RESULT,
@@ -331,15 +334,17 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 			       const u8 *resp_ie, size_t resp_ie_len,
 			       u16 status, bool wextev,
 			       struct cfg80211_bss *bss);
-int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
-				  const u8 *match_data, int match_len);
-void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid);
-void cfg80211_mlme_purge_actions(struct wireless_dev *wdev);
-int cfg80211_mlme_action(struct cfg80211_registered_device *rdev,
-			 struct net_device *dev,
-			 struct ieee80211_channel *chan,
-			 enum nl80211_channel_type channel_type,
-			 const u8 *buf, size_t len, u64 *cookie);
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+				u16 frame_type, const u8 *match_data,
+				int match_len);
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev,
+			  struct ieee80211_channel *chan,
+			  enum nl80211_channel_type channel_type,
+			  bool channel_type_valid,
+			  const u8 *buf, size_t len, u64 *cookie);
 
 /* SME */
 int __cfg80211_connect(struct cfg80211_registered_device *rdev,
@@ -370,7 +375,7 @@ bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev);
 /* internal helpers */
 int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
 				   struct key_params *params, int key_idx,
-				   const u8 *mac_addr);
+				   bool pairwise, const u8 *mac_addr);
 void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 			     size_t ie_len, u16 reason, bool from_ap);
 void cfg80211_sme_scan_done(struct net_device *dev);
diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c
index a4991a3efec..39765bcfb47 100644
--- a/net/wireless/debugfs.c
+++ b/net/wireless/debugfs.c
@@ -34,6 +34,7 @@ static ssize_t name## _read(struct file *file, char __user *userbuf,	\
 static const struct file_operations name## _ops = {			\
 	.read = name## _read,						\
 	.open = cfg80211_open_file_generic,				\
+	.llseek = generic_file_llseek,					\
 };
 
 DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d",
@@ -102,6 +103,7 @@ static ssize_t ht40allow_map_read(struct file *file,
 static const struct file_operations ht40allow_map_ops = {
 	.read = ht40allow_map_read,
 	.open = cfg80211_open_file_generic,
+	.llseek = default_llseek,
 };
 
 #define DEBUGFS_ADD(name)						\
diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk
index 3cc9e69880a..53c143f5e77 100644
--- a/net/wireless/genregdb.awk
+++ b/net/wireless/genregdb.awk
@@ -21,6 +21,7 @@ BEGIN {
 	print ""
 	print "#include <linux/nl80211.h>"
 	print "#include <net/cfg80211.h>"
+	print "#include \"regdb.h\""
 	print ""
 	regdb = "const struct ieee80211_regdomain *reg_regdb[] = {\n"
 }
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index adcabba02e2..f33fbb79437 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -88,6 +88,25 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	if (wdev->ssid_len)
 		return -EALREADY;
 
+	if (!params->basic_rates) {
+		/*
+		* If no rates were explicitly configured,
+		* use the mandatory rate set for 11b or
+		* 11a for maximum compatibility.
+		*/
+		struct ieee80211_supported_band *sband =
+			rdev->wiphy.bands[params->channel->band];
+		int j;
+		u32 flag = params->channel->band == IEEE80211_BAND_5GHZ ?
+			IEEE80211_RATE_MANDATORY_A :
+			IEEE80211_RATE_MANDATORY_B;
+
+		for (j = 0; j < sband->n_bitrates; j++) {
+			if (sband->bitrates[j].flags & flag)
+				params->basic_rates |= BIT(j);
+		}
+	}
+
 	if (WARN_ON(wdev->connect_keys))
 		kfree(wdev->connect_keys);
 	wdev->connect_keys = connkeys;
@@ -141,7 +160,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
 	 */
 	if (rdev->ops->del_key)
 		for (i = 0; i < 6; i++)
-			rdev->ops->del_key(wdev->wiphy, dev, i, NULL);
+			rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
 
 	if (wdev->current_bss) {
 		cfg80211_unhold_bss(wdev->current_bss);
@@ -247,8 +266,10 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev,
 	if (!netif_running(wdev->netdev))
 		return 0;
 
-	if (wdev->wext.keys)
+	if (wdev->wext.keys) {
 		wdev->wext.keys->def = wdev->wext.default_key;
+		wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
+	}
 
 	wdev->wext.ibss.privacy = wdev->wext.default_key != -1;
 
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index b7fa31d5fd1..dacb3b4b1bd 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -467,7 +467,6 @@ static struct lib80211_crypto_ops lib80211_crypt_ccmp = {
 	.name = "CCMP",
 	.init = lib80211_ccmp_init,
 	.deinit = lib80211_ccmp_deinit,
-	.build_iv = lib80211_ccmp_hdr,
 	.encrypt_mpdu = lib80211_ccmp_encrypt,
 	.decrypt_mpdu = lib80211_ccmp_decrypt,
 	.encrypt_msdu = NULL,
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 8cbdb32ff31..0fe40510e2c 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -578,7 +578,7 @@ static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
 	}
 
 	if (ieee80211_is_data_qos(hdr11->frame_control)) {
-		hdr[12] = le16_to_cpu(*ieee80211_get_qos_ctl(hdr11))
+		hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11)))
 			& IEEE80211_QOS_CTL_TID_MASK;
 	} else
 		hdr[12] = 0;		/* priority */
@@ -757,7 +757,6 @@ static struct lib80211_crypto_ops lib80211_crypt_tkip = {
 	.name = "TKIP",
 	.init = lib80211_tkip_init,
 	.deinit = lib80211_tkip_deinit,
-	.build_iv = lib80211_tkip_hdr,
 	.encrypt_mpdu = lib80211_tkip_encrypt,
 	.decrypt_mpdu = lib80211_tkip_decrypt,
 	.encrypt_msdu = lib80211_michael_mic_add,
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 6d41e05ca33..e2e88878ba3 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -269,7 +269,6 @@ static struct lib80211_crypto_ops lib80211_crypt_wep = {
 	.name = "WEP",
 	.init = lib80211_wep_init,
 	.deinit = lib80211_wep_deinit,
-	.build_iv = lib80211_wep_build_iv,
 	.encrypt_mpdu = lib80211_wep_encrypt,
 	.decrypt_mpdu = lib80211_wep_decrypt,
 	.encrypt_msdu = NULL,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 48ead6f0426..26838d903b9 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -44,10 +44,10 @@ void cfg80211_send_rx_auth(struct net_device *dev, const u8 *buf, size_t len)
 		}
 	}
 
-	WARN_ON(!done);
-
-	nl80211_send_rx_auth(rdev, dev, buf, len, GFP_KERNEL);
-	cfg80211_sme_rx_auth(dev, buf, len);
+	if (done) {
+		nl80211_send_rx_auth(rdev, dev, buf, len, GFP_KERNEL);
+		cfg80211_sme_rx_auth(dev, buf, len);
+	}
 
 	wdev_unlock(wdev);
 }
@@ -149,7 +149,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
 	struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf;
 	const u8 *bssid = mgmt->bssid;
 	int i;
-	bool found = false;
+	bool found = false, was_current = false;
 
 	ASSERT_WDEV_LOCK(wdev);
 
@@ -159,6 +159,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
 		cfg80211_put_bss(&wdev->current_bss->pub);
 		wdev->current_bss = NULL;
 		found = true;
+		was_current = true;
 	} else for (i = 0; i < MAX_AUTH_BSSES; i++) {
 		if (wdev->auth_bsses[i] &&
 		    memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) {
@@ -183,7 +184,7 @@ void __cfg80211_send_deauth(struct net_device *dev,
 
 	nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL);
 
-	if (wdev->sme_state == CFG80211_SME_CONNECTED) {
+	if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) {
 		u16 reason_code;
 		bool from_ap;
 
@@ -747,31 +748,53 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
 }
 EXPORT_SYMBOL(cfg80211_new_sta);
 
-struct cfg80211_action_registration {
+struct cfg80211_mgmt_registration {
 	struct list_head list;
 
 	u32 nlpid;
 
 	int match_len;
 
+	__le16 frame_type;
+
 	u8 match[];
 };
 
-int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
-				  const u8 *match_data, int match_len)
+int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
+				u16 frame_type, const u8 *match_data,
+				int match_len)
 {
-	struct cfg80211_action_registration *reg, *nreg;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct cfg80211_mgmt_registration *reg, *nreg;
 	int err = 0;
+	u16 mgmt_type;
+
+	if (!wdev->wiphy->mgmt_stypes)
+		return -EOPNOTSUPP;
+
+	if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
+		return -EINVAL;
+
+	if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
+		return -EINVAL;
+
+	mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
+	if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type)))
+		return -EINVAL;
 
 	nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
 	if (!nreg)
 		return -ENOMEM;
 
-	spin_lock_bh(&wdev->action_registrations_lock);
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
 
-	list_for_each_entry(reg, &wdev->action_registrations, list) {
+	list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
 		int mlen = min(match_len, reg->match_len);
 
+		if (frame_type != le16_to_cpu(reg->frame_type))
+			continue;
+
 		if (memcmp(reg->match, match_data, mlen) == 0) {
 			err = -EALREADY;
 			break;
@@ -786,132 +809,212 @@ int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid,
 	memcpy(nreg->match, match_data, match_len);
 	nreg->match_len = match_len;
 	nreg->nlpid = snd_pid;
-	list_add(&nreg->list, &wdev->action_registrations);
+	nreg->frame_type = cpu_to_le16(frame_type);
+	list_add(&nreg->list, &wdev->mgmt_registrations);
+
+	if (rdev->ops->mgmt_frame_register)
+		rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
+					       frame_type, true);
 
  out:
-	spin_unlock_bh(&wdev->action_registrations_lock);
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
+
 	return err;
 }
 
-void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid)
+void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid)
 {
-	struct cfg80211_action_registration *reg, *tmp;
+	struct wiphy *wiphy = wdev->wiphy;
+	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
+	struct cfg80211_mgmt_registration *reg, *tmp;
+
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+	list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
+		if (reg->nlpid != nlpid)
+			continue;
 
-	spin_lock_bh(&wdev->action_registrations_lock);
+		if (rdev->ops->mgmt_frame_register) {
+			u16 frame_type = le16_to_cpu(reg->frame_type);
 
-	list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) {
-		if (reg->nlpid == nlpid) {
-			list_del(&reg->list);
-			kfree(reg);
+			rdev->ops->mgmt_frame_register(wiphy, wdev->netdev,
+						       frame_type, false);
 		}
+
+		list_del(&reg->list);
+		kfree(reg);
 	}
 
-	spin_unlock_bh(&wdev->action_registrations_lock);
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 }
 
-void cfg80211_mlme_purge_actions(struct wireless_dev *wdev)
+void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev)
 {
-	struct cfg80211_action_registration *reg, *tmp;
+	struct cfg80211_mgmt_registration *reg, *tmp;
 
-	spin_lock_bh(&wdev->action_registrations_lock);
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
 
-	list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) {
+	list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) {
 		list_del(&reg->list);
 		kfree(reg);
 	}
 
-	spin_unlock_bh(&wdev->action_registrations_lock);
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 }
 
-int cfg80211_mlme_action(struct cfg80211_registered_device *rdev,
-			 struct net_device *dev,
-			 struct ieee80211_channel *chan,
-			 enum nl80211_channel_type channel_type,
-			 const u8 *buf, size_t len, u64 *cookie)
+int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
+			  struct net_device *dev,
+			  struct ieee80211_channel *chan,
+			  enum nl80211_channel_type channel_type,
+			  bool channel_type_valid,
+			  const u8 *buf, size_t len, u64 *cookie)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	const struct ieee80211_mgmt *mgmt;
+	u16 stype;
+
+	if (!wdev->wiphy->mgmt_stypes)
+		return -EOPNOTSUPP;
 
-	if (rdev->ops->action == NULL)
+	if (!rdev->ops->mgmt_tx)
 		return -EOPNOTSUPP;
+
 	if (len < 24 + 1)
 		return -EINVAL;
 
 	mgmt = (const struct ieee80211_mgmt *) buf;
-	if (!ieee80211_is_action(mgmt->frame_control))
+
+	if (!ieee80211_is_mgmt(mgmt->frame_control))
+		return -EINVAL;
+
+	stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE;
+	if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4)))
 		return -EINVAL;
-	if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
-		/* Verify that we are associated with the destination AP */
-		if (!wdev->current_bss ||
-		    memcmp(wdev->current_bss->pub.bssid, mgmt->bssid,
-			   ETH_ALEN) != 0 ||
-		    memcmp(wdev->current_bss->pub.bssid, mgmt->da,
-			   ETH_ALEN) != 0)
-			return -ENOTCONN;
+
+	if (ieee80211_is_action(mgmt->frame_control) &&
+	    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) {
+		int err = 0;
+
+		wdev_lock(wdev);
+
+		switch (wdev->iftype) {
+		case NL80211_IFTYPE_ADHOC:
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
+			if (!wdev->current_bss) {
+				err = -ENOTCONN;
+				break;
+			}
+
+			if (memcmp(wdev->current_bss->pub.bssid,
+				   mgmt->bssid, ETH_ALEN)) {
+				err = -ENOTCONN;
+				break;
+			}
+
+			/*
+			 * check for IBSS DA must be done by driver as
+			 * cfg80211 doesn't track the stations
+			 */
+			if (wdev->iftype == NL80211_IFTYPE_ADHOC)
+				break;
+
+			/* for station, check that DA is the AP */
+			if (memcmp(wdev->current_bss->pub.bssid,
+				   mgmt->da, ETH_ALEN)) {
+				err = -ENOTCONN;
+				break;
+			}
+			break;
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
+		case NL80211_IFTYPE_AP_VLAN:
+			if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN))
+				err = -EINVAL;
+			break;
+		default:
+			err = -EOPNOTSUPP;
+			break;
+		}
+		wdev_unlock(wdev);
+
+		if (err)
+			return err;
 	}
 
 	if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0)
 		return -EINVAL;
 
 	/* Transmit the Action frame as requested by user space */
-	return rdev->ops->action(&rdev->wiphy, dev, chan, channel_type,
-				 buf, len, cookie);
+	return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, channel_type,
+				  channel_type_valid, buf, len, cookie);
 }
 
-bool cfg80211_rx_action(struct net_device *dev, int freq, const u8 *buf,
-			size_t len, gfp_t gfp)
+bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf,
+		      size_t len, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
-	struct cfg80211_action_registration *reg;
-	const u8 *action_data;
-	int action_data_len;
+	struct cfg80211_mgmt_registration *reg;
+	const struct ieee80211_txrx_stypes *stypes =
+		&wiphy->mgmt_stypes[wdev->iftype];
+	struct ieee80211_mgmt *mgmt = (void *)buf;
+	const u8 *data;
+	int data_len;
 	bool result = false;
+	__le16 ftype = mgmt->frame_control &
+		cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE);
+	u16 stype;
 
-	/* frame length - min size excluding category */
-	action_data_len = len - (IEEE80211_MIN_ACTION_SIZE - 1);
+	stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4;
 
-	/* action data starts with category */
-	action_data = buf + IEEE80211_MIN_ACTION_SIZE - 1;
+	if (!(stypes->rx & BIT(stype)))
+		return false;
 
-	spin_lock_bh(&wdev->action_registrations_lock);
+	data = buf + ieee80211_hdrlen(mgmt->frame_control);
+	data_len = len - ieee80211_hdrlen(mgmt->frame_control);
+
+	spin_lock_bh(&wdev->mgmt_registrations_lock);
+
+	list_for_each_entry(reg, &wdev->mgmt_registrations, list) {
+		if (reg->frame_type != ftype)
+			continue;
 
-	list_for_each_entry(reg, &wdev->action_registrations, list) {
-		if (reg->match_len > action_data_len)
+		if (reg->match_len > data_len)
 			continue;
 
-		if (memcmp(reg->match, action_data, reg->match_len))
+		if (memcmp(reg->match, data, reg->match_len))
 			continue;
 
 		/* found match! */
 
 		/* Indicate the received Action frame to user space */
-		if (nl80211_send_action(rdev, dev, reg->nlpid, freq,
-					buf, len, gfp))
+		if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq,
+				      buf, len, gfp))
 			continue;
 
 		result = true;
 		break;
 	}
 
-	spin_unlock_bh(&wdev->action_registrations_lock);
+	spin_unlock_bh(&wdev->mgmt_registrations_lock);
 
 	return result;
 }
-EXPORT_SYMBOL(cfg80211_rx_action);
+EXPORT_SYMBOL(cfg80211_rx_mgmt);
 
-void cfg80211_action_tx_status(struct net_device *dev, u64 cookie,
-			       const u8 *buf, size_t len, bool ack, gfp_t gfp)
+void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie,
+			     const u8 *buf, size_t len, bool ack, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
 
 	/* Indicate TX status of the Action frame to user space */
-	nl80211_send_action_tx_status(rdev, dev, cookie, buf, len, ack, gfp);
+	nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp);
 }
-EXPORT_SYMBOL(cfg80211_action_tx_status);
+EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
 
 void cfg80211_cqm_rssi_notify(struct net_device *dev,
 			      enum nl80211_cqm_rssi_threshold_event rssi_event,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index db71150b804..c506241f863 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -23,6 +23,11 @@
 #include "nl80211.h"
 #include "reg.h"
 
+static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
+			    struct genl_info *info);
+static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
+			      struct genl_info *info);
+
 /* the netlink family */
 static struct genl_family nl80211_fam = {
 	.id = GENL_ID_GENERATE,	/* don't bother with a hardcoded ID */
@@ -31,6 +36,8 @@ static struct genl_family nl80211_fam = {
 	.version = 1,		/* no particular meaning now */
 	.maxattr = NL80211_ATTR_MAX,
 	.netnsok = true,
+	.pre_doit = nl80211_pre_doit,
+	.post_doit = nl80211_post_doit,
 };
 
 /* internal helper: get rdev and dev */
@@ -86,6 +93,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
 	[NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
+	[NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 },
 
 	[NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 },
 	[NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 },
@@ -136,6 +144,8 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 		.len = sizeof(struct nl80211_sta_flag_update),
 	},
 	[NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
+	[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
 	[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
 	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
@@ -153,9 +163,13 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_CQM] = { .type = NLA_NESTED, },
 	[NL80211_ATTR_LOCAL_STATE_CHANGE] = { .type = NLA_FLAG },
 	[NL80211_ATTR_AP_ISOLATE] = { .type = NLA_U8 },
+
+	[NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 },
+	[NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 },
 };
 
-/* policy for the attributes */
+/* policy for the key attributes */
 static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
 	[NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN },
 	[NL80211_KEY_IDX] = { .type = NLA_U8 },
@@ -163,6 +177,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = {
 	[NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 },
 	[NL80211_KEY_DEFAULT] = { .type = NLA_FLAG },
 	[NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG },
+	[NL80211_KEY_TYPE] = { .type = NLA_U32 },
 };
 
 /* ifidx get helper */
@@ -185,6 +200,47 @@ static int nl80211_get_ifidx(struct netlink_callback *cb)
 	return res;
 }
 
+static int nl80211_prepare_netdev_dump(struct sk_buff *skb,
+				       struct netlink_callback *cb,
+				       struct cfg80211_registered_device **rdev,
+				       struct net_device **dev)
+{
+	int ifidx = cb->args[0];
+	int err;
+
+	if (!ifidx)
+		ifidx = nl80211_get_ifidx(cb);
+	if (ifidx < 0)
+		return ifidx;
+
+	cb->args[0] = ifidx;
+
+	rtnl_lock();
+
+	*dev = __dev_get_by_index(sock_net(skb->sk), ifidx);
+	if (!*dev) {
+		err = -ENODEV;
+		goto out_rtnl;
+	}
+
+	*rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		goto out_rtnl;
+	}
+
+	return 0;
+ out_rtnl:
+	rtnl_unlock();
+	return err;
+}
+
+static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev)
+{
+	cfg80211_unlock_rdev(rdev);
+	rtnl_unlock();
+}
+
 /* IE validation */
 static bool is_valid_ie_attr(const struct nlattr *attr)
 {
@@ -252,6 +308,7 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
 struct key_parse {
 	struct key_params p;
 	int idx;
+	int type;
 	bool def, defmgmt;
 };
 
@@ -282,6 +339,12 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k)
 	if (tb[NL80211_KEY_CIPHER])
 		k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]);
 
+	if (tb[NL80211_KEY_TYPE]) {
+		k->type = nla_get_u32(tb[NL80211_KEY_TYPE]);
+		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -306,6 +369,12 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k)
 	k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT];
 	k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT];
 
+	if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+		k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+		if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -315,6 +384,7 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
 
 	memset(k, 0, sizeof(*k));
 	k->idx = -1;
+	k->type = -1;
 
 	if (info->attrs[NL80211_ATTR_KEY])
 		err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k);
@@ -379,7 +449,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev,
 		} else if (parse.defmgmt)
 			goto error;
 		err = cfg80211_validate_key_settings(rdev, &parse.p,
-						     parse.idx, NULL);
+						     parse.idx, false, NULL);
 		if (err)
 			goto error;
 		result->params[parse.idx].cipher = parse.p.cipher;
@@ -398,18 +468,17 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
 {
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (!netif_running(wdev->netdev))
-		return -ENETDOWN;
-
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
 		break;
 	case NL80211_IFTYPE_ADHOC:
 		if (!wdev->current_bss)
 			return -ENOLINK;
 		break;
 	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
 		if (wdev->sme_state != CFG80211_SME_CONNECTED)
 			return -ENOLINK;
 		break;
@@ -434,6 +503,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	struct ieee80211_rate *rate;
 	int i;
 	u16 ifmodes = dev->wiphy.interface_modes;
+	const struct ieee80211_txrx_stypes *mgmt_stypes =
+				dev->wiphy.mgmt_stypes;
 
 	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY);
 	if (!hdr)
@@ -461,6 +532,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN,
 		    dev->wiphy.max_scan_ie_len);
 
+	if (dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_IBSS_RSN);
+
 	NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES,
 		sizeof(u32) * dev->wiphy.n_cipher_suites,
 		dev->wiphy.cipher_suites);
@@ -468,6 +542,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS,
 		   dev->wiphy.max_num_pmkids);
 
+	if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL)
+		NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE);
+
 	nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES);
 	if (!nl_modes)
 		goto nla_put_failure;
@@ -584,12 +661,13 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	CMD(flush_pmksa, FLUSH_PMKSA);
 	CMD(remain_on_channel, REMAIN_ON_CHANNEL);
 	CMD(set_bitrate_mask, SET_TX_BITRATE_MASK);
-	CMD(action, ACTION);
+	CMD(mgmt_tx, FRAME);
 	if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) {
 		i++;
 		NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS);
 	}
 	CMD(set_channel, SET_CHANNEL);
+	CMD(set_wds_peer, SET_WDS_PEER);
 
 #undef CMD
 
@@ -605,6 +683,55 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 
 	nla_nest_end(msg, nl_cmds);
 
+	if (mgmt_stypes) {
+		u16 stypes;
+		struct nlattr *nl_ftypes, *nl_ifs;
+		enum nl80211_iftype ift;
+
+		nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES);
+		if (!nl_ifs)
+			goto nla_put_failure;
+
+		for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+			nl_ftypes = nla_nest_start(msg, ift);
+			if (!nl_ftypes)
+				goto nla_put_failure;
+			i = 0;
+			stypes = mgmt_stypes[ift].tx;
+			while (stypes) {
+				if (stypes & 1)
+					NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
+						    (i << 4) | IEEE80211_FTYPE_MGMT);
+				stypes >>= 1;
+				i++;
+			}
+			nla_nest_end(msg, nl_ftypes);
+		}
+
+		nla_nest_end(msg, nl_ifs);
+
+		nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES);
+		if (!nl_ifs)
+			goto nla_put_failure;
+
+		for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) {
+			nl_ftypes = nla_nest_start(msg, ift);
+			if (!nl_ftypes)
+				goto nla_put_failure;
+			i = 0;
+			stypes = mgmt_stypes[ift].rx;
+			while (stypes) {
+				if (stypes & 1)
+					NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE,
+						    (i << 4) | IEEE80211_FTYPE_MGMT);
+				stypes >>= 1;
+				i++;
+			}
+			nla_nest_end(msg, nl_ftypes);
+		}
+		nla_nest_end(msg, nl_ifs);
+	}
+
 	return genlmsg_end(msg, hdr);
 
  nla_put_failure:
@@ -641,28 +768,18 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
 static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
 {
 	struct sk_buff *msg;
-	struct cfg80211_registered_device *dev;
-
-	dev = cfg80211_get_dev_from_info(info);
-	if (IS_ERR(dev))
-		return PTR_ERR(dev);
+	struct cfg80211_registered_device *dev = info->user_ptr[0];
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
-		goto out_err;
-
-	if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0)
-		goto out_free;
+		return -ENOMEM;
 
-	cfg80211_unlock_rdev(dev);
+	if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
 
 	return genlmsg_reply(msg, info);
-
- out_free:
-	nlmsg_free(msg);
- out_err:
-	cfg80211_unlock_rdev(dev);
-	return -ENOBUFS;
 }
 
 static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
@@ -706,7 +823,8 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev)
 		wdev->iftype == NL80211_IFTYPE_AP ||
 		wdev->iftype == NL80211_IFTYPE_WDS ||
 		wdev->iftype == NL80211_IFTYPE_MESH_POINT ||
-		wdev->iftype == NL80211_IFTYPE_MONITOR;
+		wdev->iftype == NL80211_IFTYPE_MONITOR ||
+		wdev->iftype == NL80211_IFTYPE_P2P_GO;
 }
 
 static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
@@ -750,38 +868,48 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 
 static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *netdev;
-	int result;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *netdev = info->user_ptr[1];
 
-	rtnl_lock();
+	return __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info);
+}
 
-	result = get_rdev_dev_by_info_ifindex(info, &rdev, &netdev);
-	if (result)
-		goto unlock;
+static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	const u8 *bssid;
 
-	result = __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info);
+	if (!info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
 
- unlock:
-	rtnl_unlock();
+	if (netif_running(dev))
+		return -EBUSY;
 
-	return result;
+	if (!rdev->ops->set_wds_peer)
+		return -EOPNOTSUPP;
+
+	if (wdev->iftype != NL80211_IFTYPE_WDS)
+		return -EOPNOTSUPP;
+
+	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
+	return rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid);
 }
 
+
 static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev;
 	struct net_device *netdev = NULL;
 	struct wireless_dev *wdev;
-	int result, rem_txq_params = 0;
+	int result = 0, rem_txq_params = 0;
 	struct nlattr *nl_txq_params;
 	u32 changed;
 	u8 retry_short = 0, retry_long = 0;
 	u32 frag_threshold = 0, rts_threshold = 0;
 	u8 coverage_class = 0;
 
-	rtnl_lock();
-
 	/*
 	 * Try to find the wiphy and netdev. Normally this
 	 * function shouldn't need the netdev, but this is
@@ -808,8 +936,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		rdev = __cfg80211_rdev_from_info(info);
 		if (IS_ERR(rdev)) {
 			mutex_unlock(&cfg80211_mutex);
-			result = PTR_ERR(rdev);
-			goto unlock;
+			return PTR_ERR(rdev);
 		}
 		wdev = NULL;
 		netdev = NULL;
@@ -869,6 +996,34 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			goto bad_res;
 	}
 
+	if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
+		enum nl80211_tx_power_setting type;
+		int idx, mbm = 0;
+
+		if (!rdev->ops->set_tx_power) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
+		type = nla_get_u32(info->attrs[idx]);
+
+		if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
+		    (type != NL80211_TX_POWER_AUTOMATIC)) {
+			result = -EINVAL;
+			goto bad_res;
+		}
+
+		if (type != NL80211_TX_POWER_AUTOMATIC) {
+			idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
+			mbm = nla_get_u32(info->attrs[idx]);
+		}
+
+		result = rdev->ops->set_tx_power(&rdev->wiphy, type, mbm);
+		if (result)
+			goto bad_res;
+	}
+
 	changed = 0;
 
 	if (info->attrs[NL80211_ATTR_WIPHY_RETRY_SHORT]) {
@@ -963,8 +1118,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	mutex_unlock(&rdev->mtx);
 	if (netdev)
 		dev_put(netdev);
- unlock:
-	rtnl_unlock();
 	return result;
 }
 
@@ -1044,33 +1197,20 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
 static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info)
 {
 	struct sk_buff *msg;
-	struct cfg80211_registered_device *dev;
-	struct net_device *netdev;
-	int err;
-
-	err = get_rdev_dev_by_info_ifindex(info, &dev, &netdev);
-	if (err)
-		return err;
+	struct cfg80211_registered_device *dev = info->user_ptr[0];
+	struct net_device *netdev = info->user_ptr[1];
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
-		goto out_err;
+		return -ENOMEM;
 
 	if (nl80211_send_iface(msg, info->snd_pid, info->snd_seq, 0,
-			       dev, netdev) < 0)
-		goto out_free;
-
-	dev_put(netdev);
-	cfg80211_unlock_rdev(dev);
+			       dev, netdev) < 0) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
 
 	return genlmsg_reply(msg, info);
-
- out_free:
-	nlmsg_free(msg);
- out_err:
-	dev_put(netdev);
-	cfg80211_unlock_rdev(dev);
-	return -ENOBUFS;
 }
 
 static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = {
@@ -1107,7 +1247,7 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
 			       enum nl80211_iftype iftype)
 {
 	if (!use_4addr) {
-		if (netdev && netdev->br_port)
+		if (netdev && (netdev->priv_flags & IFF_BRIDGE_PORT))
 			return -EBUSY;
 		return 0;
 	}
@@ -1130,39 +1270,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
 
 static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct vif_params params;
 	int err;
 	enum nl80211_iftype otype, ntype;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	u32 _flags, *flags = NULL;
 	bool change = false;
 
 	memset(&params, 0, sizeof(params));
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
 	otype = ntype = dev->ieee80211_ptr->iftype;
 
 	if (info->attrs[NL80211_ATTR_IFTYPE]) {
 		ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]);
 		if (otype != ntype)
 			change = true;
-		if (ntype > NL80211_IFTYPE_MAX) {
-			err = -EINVAL;
-			goto unlock;
-		}
+		if (ntype > NL80211_IFTYPE_MAX)
+			return -EINVAL;
 	}
 
 	if (info->attrs[NL80211_ATTR_MESH_ID]) {
-		if (ntype != NL80211_IFTYPE_MESH_POINT) {
-			err = -EINVAL;
-			goto unlock;
-		}
+		if (ntype != NL80211_IFTYPE_MESH_POINT)
+			return -EINVAL;
 		params.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]);
 		params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]);
 		change = true;
@@ -1173,20 +1303,18 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 		change = true;
 		err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype);
 		if (err)
-			goto unlock;
+			return err;
 	} else {
 		params.use_4addr = -1;
 	}
 
 	if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) {
-		if (ntype != NL80211_IFTYPE_MONITOR) {
-			err = -EINVAL;
-			goto unlock;
-		}
+		if (ntype != NL80211_IFTYPE_MONITOR)
+			return -EINVAL;
 		err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS],
 					  &_flags);
 		if (err)
-			goto unlock;
+			return err;
 
 		flags = &_flags;
 		change = true;
@@ -1200,17 +1328,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info)
 	if (!err && params.use_4addr != -1)
 		dev->ieee80211_ptr->use_4addr = params.use_4addr;
 
- unlock:
-	dev_put(dev);
-	cfg80211_unlock_rdev(rdev);
- unlock_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct vif_params params;
 	int err;
 	enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED;
@@ -1227,19 +1350,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 	}
 
-	rtnl_lock();
-
-	rdev = cfg80211_get_dev_from_info(info);
-	if (IS_ERR(rdev)) {
-		err = PTR_ERR(rdev);
-		goto unlock_rtnl;
-	}
-
 	if (!rdev->ops->add_virtual_intf ||
-	    !(rdev->wiphy.interface_modes & (1 << type))) {
-		err = -EOPNOTSUPP;
-		goto unlock;
-	}
+	    !(rdev->wiphy.interface_modes & (1 << type)))
+		return -EOPNOTSUPP;
 
 	if (type == NL80211_IFTYPE_MESH_POINT &&
 	    info->attrs[NL80211_ATTR_MESH_ID]) {
@@ -1251,7 +1364,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]);
 		err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type);
 		if (err)
-			goto unlock;
+			return err;
 	}
 
 	err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ?
@@ -1261,38 +1374,18 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 		nla_data(info->attrs[NL80211_ATTR_IFNAME]),
 		type, err ? NULL : &flags, &params);
 
- unlock:
-	cfg80211_unlock_rdev(rdev);
- unlock_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
-
-	rtnl_lock();
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->del_virtual_intf) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	err = rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
+	if (!rdev->ops->del_virtual_intf)
+		return -EOPNOTSUPP;
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return rdev->ops->del_virtual_intf(&rdev->wiphy, dev);
 }
 
 struct get_key_cookie {
@@ -1345,11 +1438,12 @@ static void get_key_callback(void *c, struct key_params *params)
 
 static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	u8 key_idx = 0;
-	u8 *mac_addr = NULL;
+	const u8 *mac_addr = NULL;
+	bool pairwise;
 	struct get_key_cookie cookie = {
 		.error = 0,
 	};
@@ -1365,30 +1459,28 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_MAC])
 		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->get_key) {
-		err = -EOPNOTSUPP;
-		goto out;
+	pairwise = !!mac_addr;
+	if (info->attrs[NL80211_ATTR_KEY_TYPE]) {
+		u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]);
+		if (kt >= NUM_NL80211_KEYTYPES)
+			return -EINVAL;
+		if (kt != NL80211_KEYTYPE_GROUP &&
+		    kt != NL80211_KEYTYPE_PAIRWISE)
+			return -EINVAL;
+		pairwise = kt == NL80211_KEYTYPE_PAIRWISE;
 	}
 
+	if (!rdev->ops->get_key)
+		return -EOPNOTSUPP;
+
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 
 	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
 			     NL80211_CMD_NEW_KEY);
-
-	if (IS_ERR(hdr)) {
-		err = PTR_ERR(hdr);
-		goto free_msg;
-	}
+	if (IS_ERR(hdr))
+		return PTR_ERR(hdr);
 
 	cookie.msg = msg;
 	cookie.idx = key_idx;
@@ -1398,8 +1490,12 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
 	if (mac_addr)
 		NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr);
 
-	err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, mac_addr,
-				&cookie, get_key_callback);
+	if (pairwise && mac_addr &&
+	    !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+		return -ENOENT;
+
+	err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, pairwise,
+				 mac_addr, &cookie, get_key_callback);
 
 	if (err)
 		goto free_msg;
@@ -1408,28 +1504,21 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
-	err = genlmsg_reply(msg, info);
-	goto out;
+	return genlmsg_reply(msg, info);
 
  nla_put_failure:
 	err = -ENOBUFS;
  free_msg:
 	nlmsg_free(msg);
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
-
 	return err;
 }
 
 static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct key_parse key;
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	int (*func)(struct wiphy *wiphy, struct net_device *netdev,
 		    u8 key_index);
 
@@ -1444,21 +1533,13 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
 	if (!key.def && !key.defmgmt)
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
 	if (key.def)
 		func = rdev->ops->set_default_key;
 	else
 		func = rdev->ops->set_default_mgmt_key;
 
-	if (!func) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!func)
+		return -EOPNOTSUPP;
 
 	wdev_lock(dev->ieee80211_ptr);
 	err = nl80211_key_allowed(dev->ieee80211_ptr);
@@ -1475,23 +1556,16 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
 #endif
 	wdev_unlock(dev->ieee80211_ptr);
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-
- unlock_rtnl:
-	rtnl_unlock();
-
 	return err;
 }
 
 static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	struct key_parse key;
-	u8 *mac_addr = NULL;
+	const u8 *mac_addr = NULL;
 
 	err = nl80211_parse_key(info, &key);
 	if (err)
@@ -1503,43 +1577,42 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_MAC])
 		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
+	if (key.type == -1) {
+		if (mac_addr)
+			key.type = NL80211_KEYTYPE_PAIRWISE;
+		else
+			key.type = NL80211_KEYTYPE_GROUP;
+	}
 
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
+	/* for now */
+	if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+	    key.type != NL80211_KEYTYPE_GROUP)
+		return -EINVAL;
 
-	if (!rdev->ops->add_key) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->add_key)
+		return -EOPNOTSUPP;
 
-	if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, mac_addr)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
+					   key.type == NL80211_KEYTYPE_PAIRWISE,
+					   mac_addr))
+		return -EINVAL;
 
 	wdev_lock(dev->ieee80211_ptr);
 	err = nl80211_key_allowed(dev->ieee80211_ptr);
 	if (!err)
 		err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx,
+					 key.type == NL80211_KEYTYPE_PAIRWISE,
 					 mac_addr, &key.p);
 	wdev_unlock(dev->ieee80211_ptr);
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
-
 	return err;
 }
 
 static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	u8 *mac_addr = NULL;
 	struct key_parse key;
 
@@ -1550,21 +1623,32 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_MAC])
 		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
+	if (key.type == -1) {
+		if (mac_addr)
+			key.type = NL80211_KEYTYPE_PAIRWISE;
+		else
+			key.type = NL80211_KEYTYPE_GROUP;
+	}
 
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
+	/* for now */
+	if (key.type != NL80211_KEYTYPE_PAIRWISE &&
+	    key.type != NL80211_KEYTYPE_GROUP)
+		return -EINVAL;
 
-	if (!rdev->ops->del_key) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->del_key)
+		return -EOPNOTSUPP;
 
 	wdev_lock(dev->ieee80211_ptr);
 	err = nl80211_key_allowed(dev->ieee80211_ptr);
+
+	if (key.type == NL80211_KEYTYPE_PAIRWISE && mac_addr &&
+	    !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+		err = -ENOENT;
+
 	if (!err)
-		err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx, mac_addr);
+		err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx,
+					 key.type == NL80211_KEYTYPE_PAIRWISE,
+					 mac_addr);
 
 #ifdef CONFIG_CFG80211_WEXT
 	if (!err) {
@@ -1576,13 +1660,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info)
 #endif
 	wdev_unlock(dev->ieee80211_ptr);
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-
- unlock_rtnl:
-	rtnl_unlock();
-
 	return err;
 }
 
@@ -1590,35 +1667,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
 {
         int (*call)(struct wiphy *wiphy, struct net_device *dev,
 		    struct beacon_parameters *info);
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct beacon_parameters params;
 	int haveinfo = 0;
 
 	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_BEACON_TAIL]))
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
 
 	switch (info->genlhdr->cmd) {
 	case NL80211_CMD_NEW_BEACON:
 		/* these are required for NEW_BEACON */
 		if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] ||
 		    !info->attrs[NL80211_ATTR_DTIM_PERIOD] ||
-		    !info->attrs[NL80211_ATTR_BEACON_HEAD]) {
-			err = -EINVAL;
-			goto out;
-		}
+		    !info->attrs[NL80211_ATTR_BEACON_HEAD])
+			return -EINVAL;
 
 		call = rdev->ops->add_beacon;
 		break;
@@ -1627,14 +1694,11 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
 		break;
 	default:
 		WARN_ON(1);
-		err = -EOPNOTSUPP;
-		goto out;
+		return -EOPNOTSUPP;
 	}
 
-	if (!call) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!call)
+		return -EOPNOTSUPP;
 
 	memset(&params, 0, sizeof(params));
 
@@ -1664,52 +1728,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info)
 		haveinfo = 1;
 	}
 
-	if (!haveinfo) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	err = call(&rdev->wiphy, dev, &params);
-
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
+	if (!haveinfo)
+		return -EINVAL;
 
-	return err;
+	return call(&rdev->wiphy, dev, &params);
 }
 
 static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
-
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 
-	if (!rdev->ops->del_beacon) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-	err = rdev->ops->del_beacon(&rdev->wiphy, dev);
+	if (!rdev->ops->del_beacon)
+		return -EOPNOTSUPP;
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->del_beacon(&rdev->wiphy, dev);
 }
 
 static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = {
@@ -1830,6 +1867,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
 	if (sinfo->filled & STATION_INFO_TX_PACKETS)
 		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
 			    sinfo->tx_packets);
+	if (sinfo->filled & STATION_INFO_TX_RETRIES)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_RETRIES,
+			    sinfo->tx_retries);
+	if (sinfo->filled & STATION_INFO_TX_FAILED)
+		NLA_PUT_U32(msg, NL80211_STA_INFO_TX_FAILED,
+			    sinfo->tx_failed);
 	nla_nest_end(msg, sinfoattr);
 
 	return genlmsg_end(msg, hdr);
@@ -1846,28 +1889,12 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	struct cfg80211_registered_device *dev;
 	struct net_device *netdev;
 	u8 mac_addr[ETH_ALEN];
-	int ifidx = cb->args[0];
 	int sta_idx = cb->args[1];
 	int err;
 
-	if (!ifidx)
-		ifidx = nl80211_get_ifidx(cb);
-	if (ifidx < 0)
-		return ifidx;
-
-	rtnl_lock();
-
-	netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
-	if (!netdev) {
-		err = -ENODEV;
-		goto out_rtnl;
-	}
-
-	dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
-	if (IS_ERR(dev)) {
-		err = PTR_ERR(dev);
-		goto out_rtnl;
-	}
+	err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+	if (err)
+		return err;
 
 	if (!dev->ops->dump_station) {
 		err = -EOPNOTSUPP;
@@ -1897,21 +1924,19 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	cb->args[1] = sta_idx;
 	err = skb->len;
  out_err:
-	cfg80211_unlock_rdev(dev);
- out_rtnl:
-	rtnl_unlock();
+	nl80211_finish_netdev_dump(dev);
 
 	return err;
 }
 
 static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct station_info sinfo;
 	struct sk_buff *msg;
 	u8 *mac_addr = NULL;
+	int err;
 
 	memset(&sinfo, 0, sizeof(sinfo));
 
@@ -1920,41 +1945,24 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->get_station) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->get_station)
+		return -EOPNOTSUPP;
 
 	err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo);
 	if (err)
-		goto out;
+		return err;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
-		goto out;
+		return -ENOMEM;
 
 	if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0,
-				 dev, mac_addr, &sinfo) < 0)
-		goto out_free;
-
-	err = genlmsg_reply(msg, info);
-	goto out;
-
- out_free:
-	nlmsg_free(msg);
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+				 dev, mac_addr, &sinfo) < 0) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
 
-	return err;
+	return genlmsg_reply(msg, info);
 }
 
 /*
@@ -1984,9 +1992,9 @@ static int get_vlan(struct genl_info *info,
 
 static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	struct station_parameters params;
 	u8 *mac_addr = NULL;
 
@@ -2024,12 +2032,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 		params.plink_action =
 		    nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
 	err = get_vlan(info, rdev, &params.vlan);
 	if (err)
 		goto out;
@@ -2040,10 +2042,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 	switch (dev->ieee80211_ptr->iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
 		/* disallow mesh-specific things */
 		if (params.plink_action)
 			err = -EINVAL;
 		break;
+	case NL80211_IFTYPE_P2P_CLIENT:
 	case NL80211_IFTYPE_STATION:
 		/* disallow everything but AUTHORIZED flag */
 		if (params.plink_action)
@@ -2089,19 +2093,15 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
  out:
 	if (params.vlan)
 		dev_put(params.vlan);
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
 
 	return err;
 }
 
 static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	struct station_parameters params;
 	u8 *mac_addr = NULL;
 
@@ -2138,17 +2138,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	if (parse_station_flags(info, &params))
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
-	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN) {
-		err = -EINVAL;
-		goto out;
-	}
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EINVAL;
 
 	err = get_vlan(info, rdev, &params.vlan);
 	if (err)
@@ -2162,61 +2155,33 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
-
 	err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr, &params);
 
  out:
 	if (params.vlan)
 		dev_put(params.vlan);
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
-
 	return err;
 }
 
 static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	u8 *mac_addr = NULL;
 
 	if (info->attrs[NL80211_ATTR_MAC])
 		mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
 	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
 	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
-	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	if (!rdev->ops->del_station) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	err = rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EINVAL;
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (!rdev->ops->del_station)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->del_station(&rdev->wiphy, dev, mac_addr);
 }
 
 static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq,
@@ -2279,28 +2244,12 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
 	struct net_device *netdev;
 	u8 dst[ETH_ALEN];
 	u8 next_hop[ETH_ALEN];
-	int ifidx = cb->args[0];
 	int path_idx = cb->args[1];
 	int err;
 
-	if (!ifidx)
-		ifidx = nl80211_get_ifidx(cb);
-	if (ifidx < 0)
-		return ifidx;
-
-	rtnl_lock();
-
-	netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
-	if (!netdev) {
-		err = -ENODEV;
-		goto out_rtnl;
-	}
-
-	dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
-	if (IS_ERR(dev)) {
-		err = PTR_ERR(dev);
-		goto out_rtnl;
-	}
+	err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+	if (err)
+		return err;
 
 	if (!dev->ops->dump_mpath) {
 		err = -EOPNOTSUPP;
@@ -2334,18 +2283,15 @@ static int nl80211_dump_mpath(struct sk_buff *skb,
 	cb->args[1] = path_idx;
 	err = skb->len;
  out_err:
-	cfg80211_unlock_rdev(dev);
- out_rtnl:
-	rtnl_unlock();
-
+	nl80211_finish_netdev_dump(dev);
 	return err;
 }
 
 static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	struct mpath_info pinfo;
 	struct sk_buff *msg;
 	u8 *dst = NULL;
@@ -2358,53 +2304,33 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info)
 
 	dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->get_mpath) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->get_mpath)
+		return -EOPNOTSUPP;
 
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
 
 	err = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo);
 	if (err)
-		goto out;
+		return err;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg)
-		goto out;
+		return -ENOMEM;
 
 	if (nl80211_send_mpath(msg, info->snd_pid, info->snd_seq, 0,
-				 dev, dst, next_hop, &pinfo) < 0)
-		goto out_free;
-
-	err = genlmsg_reply(msg, info);
-	goto out;
-
- out_free:
-	nlmsg_free(msg);
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+				 dev, dst, next_hop, &pinfo) < 0) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
 
-	return err;
+	return genlmsg_reply(msg, info);
 }
 
 static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	u8 *dst = NULL;
 	u8 *next_hop = NULL;
 
@@ -2417,42 +2343,19 @@ static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info)
 	dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
 	next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->change_mpath) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
-
-	err = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
+	if (!rdev->ops->change_mpath)
+		return -EOPNOTSUPP;
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop);
 }
+
 static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	u8 *dst = NULL;
 	u8 *next_hop = NULL;
 
@@ -2465,75 +2368,34 @@ static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info)
 	dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
 	next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->add_mpath) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
-
-	err = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
+	if (!rdev->ops->add_mpath)
+		return -EOPNOTSUPP;
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop);
 }
 
 static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	u8 *dst = NULL;
 
 	if (info->attrs[NL80211_ATTR_MAC])
 		dst = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->del_mpath) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	err = rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
-
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (!rdev->ops->del_mpath)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->del_mpath(&rdev->wiphy, dev, dst);
 }
 
 static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct bss_parameters params;
 
 	memset(&params, 0, sizeof(params));
@@ -2561,31 +2423,14 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_AP_ISOLATE])
 		params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->change_bss) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	err = rdev->ops->change_bss(&rdev->wiphy, dev, &params);
+	if (!rdev->ops->change_bss)
+		return -EOPNOTSUPP;
 
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->change_bss(&rdev->wiphy, dev, &params);
 }
 
 static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = {
@@ -2664,37 +2509,26 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
 static int nl80211_get_mesh_params(struct sk_buff *skb,
 	struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct mesh_config cur_params;
 	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	void *hdr;
 	struct nlattr *pinfoattr;
 	struct sk_buff *msg;
 
-	rtnl_lock();
-
-	/* Look up our device */
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->get_mesh_params) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->get_mesh_params)
+		return -EOPNOTSUPP;
 
 	/* Get the mesh params */
 	err = rdev->ops->get_mesh_params(&rdev->wiphy, dev, &cur_params);
 	if (err)
-		goto out;
+		return err;
 
 	/* Draw up a netlink message to send back */
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		err = -ENOBUFS;
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
 			     NL80211_CMD_GET_MESH_PARAMS);
 	if (!hdr)
@@ -2733,20 +2567,12 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
 			cur_params.dot11MeshHWMPRootMode);
 	nla_nest_end(msg, pinfoattr);
 	genlmsg_end(msg, hdr);
-	err = genlmsg_reply(msg, info);
-	goto out;
+	return genlmsg_reply(msg, info);
 
  nla_put_failure:
 	genlmsg_cancel(msg, hdr);
-	err = -EMSGSIZE;
- out:
-	/* Cleanup */
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
-
-	return err;
+	nlmsg_free(msg);
+	return -ENOBUFS;
 }
 
 #define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \
@@ -2776,10 +2602,9 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A
 
 static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
 {
-	int err;
 	u32 mask;
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct mesh_config cfg;
 	struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
 	struct nlattr *parent_attr;
@@ -2791,16 +2616,8 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
 			parent_attr, nl80211_meshconf_params_policy))
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
-	if (!rdev->ops->set_mesh_params) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->set_mesh_params)
+		return -EOPNOTSUPP;
 
 	/* This makes sure that there aren't more than 32 mesh config
 	 * parameters (otherwise our bitfield scheme would not work.) */
@@ -2846,16 +2663,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
 			nla_get_u8);
 
 	/* Apply changes */
-	err = rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask);
-
- out:
-	/* cleanup */
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
-
-	return err;
+	return rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask);
 }
 
 #undef FILL_IN_MESH_PARAM_IF_SET
@@ -2929,6 +2737,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info)
 
 nla_put_failure:
 	genlmsg_cancel(msg, hdr);
+	nlmsg_free(msg);
 	err = -EMSGSIZE;
 out:
 	mutex_unlock(&cfg80211_mutex);
@@ -3037,8 +2846,8 @@ static int validate_scan_freqs(struct nlattr *freqs)
 
 static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_scan_request *request;
 	struct cfg80211_ssid *ssid;
 	struct ieee80211_channel *channel;
@@ -3051,36 +2860,19 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
 	wiphy = &rdev->wiphy;
 
-	if (!rdev->ops->scan) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (!rdev->ops->scan)
+		return -EOPNOTSUPP;
 
-	if (rdev->scan_req) {
-		err = -EBUSY;
-		goto out;
-	}
+	if (rdev->scan_req)
+		return -EBUSY;
 
 	if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
 		n_channels = validate_scan_freqs(
 				info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]);
-		if (!n_channels) {
-			err = -EINVAL;
-			goto out;
-		}
+		if (!n_channels)
+			return -EINVAL;
 	} else {
 		n_channels = 0;
 
@@ -3093,29 +2885,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 		nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp)
 			n_ssids++;
 
-	if (n_ssids > wiphy->max_scan_ssids) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (n_ssids > wiphy->max_scan_ssids)
+		return -EINVAL;
 
 	if (info->attrs[NL80211_ATTR_IE])
 		ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
 	else
 		ie_len = 0;
 
-	if (ie_len > wiphy->max_scan_ie_len) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (ie_len > wiphy->max_scan_ie_len)
+		return -EINVAL;
 
 	request = kzalloc(sizeof(*request)
 			+ sizeof(*ssid) * n_ssids
 			+ sizeof(channel) * n_channels
 			+ ie_len, GFP_KERNEL);
-	if (!request) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!request)
+		return -ENOMEM;
 
 	if (n_ssids)
 		request->ssids = (void *)&request->channels[n_channels];
@@ -3203,18 +2989,11 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	if (!err) {
 		nl80211_send_scan_start(rdev, dev);
 		dev_hold(dev);
-	}
-
+	} else {
  out_free:
-	if (err) {
 		rdev->scan_req = NULL;
 		kfree(request);
 	}
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
 
 	return err;
 }
@@ -3273,6 +3052,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 	}
 
 	switch (wdev->iftype) {
+	case NL80211_IFTYPE_P2P_CLIENT:
 	case NL80211_IFTYPE_STATION:
 		if (intbss == wdev->current_bss)
 			NLA_PUT_U32(msg, NL80211_BSS_STATUS,
@@ -3310,25 +3090,12 @@ static int nl80211_dump_scan(struct sk_buff *skb,
 	struct net_device *dev;
 	struct cfg80211_internal_bss *scan;
 	struct wireless_dev *wdev;
-	int ifidx = cb->args[0];
 	int start = cb->args[1], idx = 0;
 	int err;
 
-	if (!ifidx)
-		ifidx = nl80211_get_ifidx(cb);
-	if (ifidx < 0)
-		return ifidx;
-	cb->args[0] = ifidx;
-
-	dev = dev_get_by_index(sock_net(skb->sk), ifidx);
-	if (!dev)
-		return -ENODEV;
-
-	rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
-	if (IS_ERR(rdev)) {
-		err = PTR_ERR(rdev);
-		goto out_put_netdev;
-	}
+	err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev);
+	if (err)
+		return err;
 
 	wdev = dev->ieee80211_ptr;
 
@@ -3344,21 +3111,17 @@ static int nl80211_dump_scan(struct sk_buff *skb,
 				cb->nlh->nlmsg_seq, NLM_F_MULTI,
 				rdev, wdev, scan) < 0) {
 			idx--;
-			goto out;
+			break;
 		}
 	}
 
- out:
 	spin_unlock_bh(&rdev->bss_lock);
 	wdev_unlock(wdev);
 
 	cb->args[1] = idx;
-	err = skb->len;
-	cfg80211_unlock_rdev(rdev);
- out_put_netdev:
-	dev_put(dev);
+	nl80211_finish_netdev_dump(rdev);
 
-	return err;
+	return skb->len;
 }
 
 static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
@@ -3388,6 +3151,23 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq,
 	if (survey->filled & SURVEY_INFO_NOISE_DBM)
 		NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE,
 			    survey->noise);
+	if (survey->filled & SURVEY_INFO_IN_USE)
+		NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME,
+			    survey->channel_time);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_BUSY)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY,
+			    survey->channel_time_busy);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY,
+			    survey->channel_time_ext_busy);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_RX)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_RX,
+			    survey->channel_time_rx);
+	if (survey->filled & SURVEY_INFO_CHANNEL_TIME_TX)
+		NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_TX,
+			    survey->channel_time_tx);
 
 	nla_nest_end(msg, infoattr);
 
@@ -3404,29 +3184,12 @@ static int nl80211_dump_survey(struct sk_buff *skb,
 	struct survey_info survey;
 	struct cfg80211_registered_device *dev;
 	struct net_device *netdev;
-	int ifidx = cb->args[0];
 	int survey_idx = cb->args[1];
 	int res;
 
-	if (!ifidx)
-		ifidx = nl80211_get_ifidx(cb);
-	if (ifidx < 0)
-		return ifidx;
-	cb->args[0] = ifidx;
-
-	rtnl_lock();
-
-	netdev = __dev_get_by_index(sock_net(skb->sk), ifidx);
-	if (!netdev) {
-		res = -ENODEV;
-		goto out_rtnl;
-	}
-
-	dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx);
-	if (IS_ERR(dev)) {
-		res = PTR_ERR(dev);
-		goto out_rtnl;
-	}
+	res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev);
+	if (res)
+		return res;
 
 	if (!dev->ops->dump_survey) {
 		res = -EOPNOTSUPP;
@@ -3454,10 +3217,7 @@ static int nl80211_dump_survey(struct sk_buff *skb,
 	cb->args[1] = survey_idx;
 	res = skb->len;
  out_err:
-	cfg80211_unlock_rdev(dev);
- out_rtnl:
-	rtnl_unlock();
-
+	nl80211_finish_netdev_dump(dev);
 	return res;
 }
 
@@ -3490,8 +3250,8 @@ static bool nl80211_valid_cipher_suite(u32 cipher)
 
 static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct ieee80211_channel *chan;
 	const u8 *bssid, *ssid, *ie = NULL;
 	int err, ssid_len, ie_len = 0;
@@ -3519,6 +3279,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 		return err;
 
 	if (key.idx >= 0) {
+		if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP)
+			return -EINVAL;
 		if (!key.p.key || !key.p.key_len)
 			return -EINVAL;
 		if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 ||
@@ -3533,34 +3295,31 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 		key.p.key = NULL;
 	}
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->auth) {
-		err = -EOPNOTSUPP;
-		goto out;
+	if (key.idx >= 0) {
+		int i;
+		bool ok = false;
+		for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) {
+			if (key.p.cipher == rdev->wiphy.cipher_suites[i]) {
+				ok = true;
+				break;
+			}
+		}
+		if (!ok)
+			return -EINVAL;
 	}
 
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->auth)
+		return -EOPNOTSUPP;
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
 	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
 	chan = ieee80211_get_channel(&rdev->wiphy,
 		nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
-	if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
+		return -EINVAL;
 
 	ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
 	ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
@@ -3571,27 +3330,19 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
-	if (!nl80211_valid_auth_type(auth_type)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!nl80211_valid_auth_type(auth_type))
+		return -EINVAL;
 
 	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
 
-	err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
-				 ssid, ssid_len, ie, ie_len,
-				 key.p.key, key.p.key_len, key.idx,
-				 local_state_change);
-
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid,
+				  ssid, ssid_len, ie, ie_len,
+				  key.p.key, key.p.key_len, key.idx,
+				  local_state_change);
 }
 
-static int nl80211_crypto_settings(struct genl_info *info,
+static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
+				   struct genl_info *info,
 				   struct cfg80211_crypto_settings *settings,
 				   int cipher_limit)
 {
@@ -3599,6 +3350,19 @@ static int nl80211_crypto_settings(struct genl_info *info,
 
 	settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT];
 
+	if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
+		u16 proto;
+		proto = nla_get_u16(
+			info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
+		settings->control_port_ethertype = cpu_to_be16(proto);
+		if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) &&
+		    proto != ETH_P_PAE)
+			return -EINVAL;
+		if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT])
+			settings->control_port_no_encrypt = true;
+	} else
+		settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);
+
 	if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
 		void *data;
 		int len, i;
@@ -3658,8 +3422,8 @@ static int nl80211_crypto_settings(struct genl_info *info,
 
 static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_crypto_settings crypto;
 	struct ieee80211_channel *chan;
 	const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL;
@@ -3674,35 +3438,19 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	    !info->attrs[NL80211_ATTR_WIPHY_FREQ])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->assoc) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->assoc)
+		return -EOPNOTSUPP;
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
 	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
 	chan = ieee80211_get_channel(&rdev->wiphy,
 		nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
-	if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
+		return -EINVAL;
 
 	ssid = nla_data(info->attrs[NL80211_ATTR_SSID]);
 	ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
@@ -3717,35 +3465,28 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 			nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]);
 		if (mfp == NL80211_MFP_REQUIRED)
 			use_mfp = true;
-		else if (mfp != NL80211_MFP_NO) {
-			err = -EINVAL;
-			goto out;
-		}
+		else if (mfp != NL80211_MFP_NO)
+			return -EINVAL;
 	}
 
 	if (info->attrs[NL80211_ATTR_PREV_BSSID])
 		prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
 
-	err = nl80211_crypto_settings(info, &crypto, 1);
+	err = nl80211_crypto_settings(rdev, info, &crypto, 1);
 	if (!err)
 		err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid,
 					  ssid, ssid_len, ie, ie_len, use_mfp,
 					  &crypto);
 
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	const u8 *ie = NULL, *bssid;
-	int err, ie_len = 0;
+	int ie_len = 0;
 	u16 reason_code;
 	bool local_state_change;
 
@@ -3758,34 +3499,19 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_REASON_CODE])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->deauth) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->deauth)
+		return -EOPNOTSUPP;
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
 	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
 	reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
 	if (reason_code == 0) {
 		/* Reason Code 0 is reserved */
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	if (info->attrs[NL80211_ATTR_IE]) {
@@ -3795,23 +3521,16 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
 
 	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
 
-	err = cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
-				   local_state_change);
-
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code,
+				    local_state_change);
 }
 
 static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	const u8 *ie = NULL, *bssid;
-	int err, ie_len = 0;
+	int ie_len = 0;
 	u16 reason_code;
 	bool local_state_change;
 
@@ -3824,34 +3543,19 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_REASON_CODE])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->disassoc) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->disassoc)
+		return -EOPNOTSUPP;
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
 	bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
 	reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]);
 	if (reason_code == 0) {
 		/* Reason Code 0 is reserved */
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	if (info->attrs[NL80211_ATTR_IE]) {
@@ -3861,21 +3565,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
 
 	local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE];
 
-	err = cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
-				     local_state_change);
-
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code,
+				      local_state_change);
 }
 
 static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_ibss_params ibss;
 	struct wiphy *wiphy;
 	struct cfg80211_cached_keys *connkeys = NULL;
@@ -3900,26 +3597,11 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 			return -EINVAL;
 	}
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->join_ibss) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->join_ibss)
+		return -EOPNOTSUPP;
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
 
 	wiphy = &rdev->wiphy;
 
@@ -3937,71 +3619,65 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 		nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
 	if (!ibss.channel ||
 	    ibss.channel->flags & IEEE80211_CHAN_NO_IBSS ||
-	    ibss.channel->flags & IEEE80211_CHAN_DISABLED) {
-		err = -EINVAL;
-		goto out;
-	}
+	    ibss.channel->flags & IEEE80211_CHAN_DISABLED)
+		return -EINVAL;
 
 	ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED];
 	ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY];
 
+	if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+		u8 *rates =
+			nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+		int n_rates =
+			nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+		struct ieee80211_supported_band *sband =
+			wiphy->bands[ibss.channel->band];
+		int i, j;
+
+		if (n_rates == 0)
+			return -EINVAL;
+
+		for (i = 0; i < n_rates; i++) {
+			int rate = (rates[i] & 0x7f) * 5;
+			bool found = false;
+
+			for (j = 0; j < sband->n_bitrates; j++) {
+				if (sband->bitrates[j].bitrate == rate) {
+					found = true;
+					ibss.basic_rates |= BIT(j);
+					break;
+				}
+			}
+			if (!found)
+				return -EINVAL;
+		}
+	}
+
 	if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) {
 		connkeys = nl80211_parse_connkeys(rdev,
 					info->attrs[NL80211_ATTR_KEYS]);
-		if (IS_ERR(connkeys)) {
-			err = PTR_ERR(connkeys);
-			connkeys = NULL;
-			goto out;
-		}
+		if (IS_ERR(connkeys))
+			return PTR_ERR(connkeys);
 	}
 
 	err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
-
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
 	if (err)
 		kfree(connkeys);
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
-	int err;
-
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 
-	if (!rdev->ops->leave_ibss) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (!rdev->ops->leave_ibss)
+		return -EOPNOTSUPP;
 
-	err = cfg80211_leave_ibss(rdev, dev, false);
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC)
+		return -EOPNOTSUPP;
 
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return cfg80211_leave_ibss(rdev, dev, false);
 }
 
 #ifdef CONFIG_NL80211_TESTMODE
@@ -4011,20 +3687,12 @@ static struct genl_multicast_group nl80211_testmode_mcgrp = {
 
 static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int err;
 
 	if (!info->attrs[NL80211_ATTR_TESTDATA])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	rdev = cfg80211_get_dev_from_info(info);
-	if (IS_ERR(rdev)) {
-		err = PTR_ERR(rdev);
-		goto unlock_rtnl;
-	}
-
 	err = -EOPNOTSUPP;
 	if (rdev->ops->testmode_cmd) {
 		rdev->testmode_info = info;
@@ -4034,10 +3702,6 @@ static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info)
 		rdev->testmode_info = NULL;
 	}
 
-	cfg80211_unlock_rdev(rdev);
-
- unlock_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
@@ -4128,8 +3792,8 @@ EXPORT_SYMBOL(cfg80211_testmode_event);
 
 static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_connect_params connect;
 	struct wiphy *wiphy;
 	struct cfg80211_cached_keys *connkeys = NULL;
@@ -4154,25 +3818,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 
 	connect.privacy = info->attrs[NL80211_ATTR_PRIVACY];
 
-	err = nl80211_crypto_settings(info, &connect.crypto,
+	err = nl80211_crypto_settings(rdev, info, &connect.crypto,
 				      NL80211_MAX_NR_CIPHER_SUITES);
 	if (err)
 		return err;
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
 	wiphy = &rdev->wiphy;
 
@@ -4191,39 +3844,27 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 			ieee80211_get_channel(wiphy,
 			    nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]));
 		if (!connect.channel ||
-		    connect.channel->flags & IEEE80211_CHAN_DISABLED) {
-			err = -EINVAL;
-			goto out;
-		}
+		    connect.channel->flags & IEEE80211_CHAN_DISABLED)
+			return -EINVAL;
 	}
 
 	if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) {
 		connkeys = nl80211_parse_connkeys(rdev,
 					info->attrs[NL80211_ATTR_KEYS]);
-		if (IS_ERR(connkeys)) {
-			err = PTR_ERR(connkeys);
-			connkeys = NULL;
-			goto out;
-		}
+		if (IS_ERR(connkeys))
+			return PTR_ERR(connkeys);
 	}
 
 	err = cfg80211_connect(rdev, dev, &connect, connkeys);
-
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
 	if (err)
 		kfree(connkeys);
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
-	int err;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	u16 reason;
 
 	if (!info->attrs[NL80211_ATTR_REASON_CODE])
@@ -4234,35 +3875,16 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
 	if (reason == 0)
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
-
-	err = cfg80211_disconnect(rdev, dev, reason, true);
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return cfg80211_disconnect(rdev, dev, reason, true);
 }
 
 static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net *net;
 	int err;
 	u32 pid;
@@ -4272,43 +3894,26 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info)
 
 	pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]);
 
-	rtnl_lock();
-
-	rdev = cfg80211_get_dev_from_info(info);
-	if (IS_ERR(rdev)) {
-		err = PTR_ERR(rdev);
-		goto out_rtnl;
-	}
-
 	net = get_net_ns_by_pid(pid);
-	if (IS_ERR(net)) {
-		err = PTR_ERR(net);
-		goto out;
-	}
+	if (IS_ERR(net))
+		return PTR_ERR(net);
 
 	err = 0;
 
 	/* check if anything to do */
-	if (net_eq(wiphy_net(&rdev->wiphy), net))
-		goto out_put_net;
+	if (!net_eq(wiphy_net(&rdev->wiphy), net))
+		err = cfg80211_switch_netns(rdev, net);
 
-	err = cfg80211_switch_netns(rdev, net);
- out_put_net:
 	put_net(net);
- out:
-	cfg80211_unlock_rdev(rdev);
- out_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev,
 			struct cfg80211_pmksa *pmksa) = NULL;
-	int err;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_pmksa pmksa;
 
 	memset(&pmksa, 0, sizeof(struct cfg80211_pmksa));
@@ -4319,19 +3924,12 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
 	if (!info->attrs[NL80211_ATTR_PMKID])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
-
 	pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]);
 	pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
 	switch (info->genlhdr->cmd) {
 	case NL80211_CMD_SET_PMKSA:
@@ -4345,61 +3943,32 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
 		break;
 	}
 
-	if (!rdev_ops) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	err = rdev_ops(&rdev->wiphy, dev, &pmksa);
-
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (!rdev_ops)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev_ops(&rdev->wiphy, dev, &pmksa);
 }
 
 static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	int err;
-	struct net_device *dev;
-
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto out_rtnl;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!rdev->ops->flush_pmksa) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	err = rdev->ops->flush_pmksa(&rdev->wiphy, dev);
-
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- out_rtnl:
-	rtnl_unlock();
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
-	return err;
+	if (!rdev->ops->flush_pmksa)
+		return -EOPNOTSUPP;
 
+	return rdev->ops->flush_pmksa(&rdev->wiphy, dev);
 }
 
 static int nl80211_remain_on_channel(struct sk_buff *skb,
 				     struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct ieee80211_channel *chan;
 	struct sk_buff *msg;
 	void *hdr;
@@ -4421,21 +3990,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 	if (!duration || !msecs_to_jiffies(duration) || duration > 5000)
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->remain_on_channel) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (!rdev->ops->remain_on_channel)
+		return -EOPNOTSUPP;
 
 	if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
 		channel_type = nla_get_u32(
@@ -4443,24 +3999,18 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 		if (channel_type != NL80211_CHAN_NO_HT &&
 		    channel_type != NL80211_CHAN_HT20 &&
 		    channel_type != NL80211_CHAN_HT40PLUS &&
-		    channel_type != NL80211_CHAN_HT40MINUS) {
-			err = -EINVAL;
-			goto out;
-		}
+		    channel_type != NL80211_CHAN_HT40MINUS)
+			return -EINVAL;
 	}
 
 	freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
 	chan = rdev_freq_to_chan(rdev, freq, channel_type);
-	if (chan == NULL) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (chan == NULL)
+		return -EINVAL;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 
 	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
 			     NL80211_CMD_REMAIN_ON_CHANNEL);
@@ -4479,58 +4029,32 @@ static int nl80211_remain_on_channel(struct sk_buff *skb,
 	NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
 
 	genlmsg_end(msg, hdr);
-	err = genlmsg_reply(msg, info);
-	goto out;
+
+	return genlmsg_reply(msg, info);
 
  nla_put_failure:
 	err = -ENOBUFS;
  free_msg:
 	nlmsg_free(msg);
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_cancel_remain_on_channel(struct sk_buff *skb,
 					    struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	u64 cookie;
-	int err;
 
 	if (!info->attrs[NL80211_ATTR_COOKIE])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->cancel_remain_on_channel) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (!rdev->ops->cancel_remain_on_channel)
+		return -EOPNOTSUPP;
 
 	cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]);
 
-	err = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie);
-
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie);
 }
 
 static u32 rateset_to_mask(struct ieee80211_supported_band *sband,
@@ -4566,26 +4090,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
 				       struct genl_info *info)
 {
 	struct nlattr *tb[NL80211_TXRATE_MAX + 1];
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct cfg80211_bitrate_mask mask;
-	int err, rem, i;
-	struct net_device *dev;
+	int rem, i;
+	struct net_device *dev = info->user_ptr[1];
 	struct nlattr *tx_rates;
 	struct ieee80211_supported_band *sband;
 
 	if (info->attrs[NL80211_ATTR_TX_RATES] == NULL)
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->set_bitrate_mask) {
-		err = -EOPNOTSUPP;
-		goto unlock;
-	}
+	if (!rdev->ops->set_bitrate_mask)
+		return -EOPNOTSUPP;
 
 	memset(&mask, 0, sizeof(mask));
 	/* Default to all rates enabled */
@@ -4602,15 +4118,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
 	nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem)
 	{
 		enum ieee80211_band band = nla_type(tx_rates);
-		if (band < 0 || band >= IEEE80211_NUM_BANDS) {
-			err = -EINVAL;
-			goto unlock;
-		}
+		if (band < 0 || band >= IEEE80211_NUM_BANDS)
+			return -EINVAL;
 		sband = rdev->wiphy.bands[band];
-		if (sband == NULL) {
-			err = -EINVAL;
-			goto unlock;
-		}
+		if (sband == NULL)
+			return -EINVAL;
 		nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates),
 			  nla_len(tx_rates), nl80211_txattr_policy);
 		if (tb[NL80211_TXRATE_LEGACY]) {
@@ -4618,69 +4130,51 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb,
 				sband,
 				nla_data(tb[NL80211_TXRATE_LEGACY]),
 				nla_len(tb[NL80211_TXRATE_LEGACY]));
-			if (mask.control[band].legacy == 0) {
-				err = -EINVAL;
-				goto unlock;
-			}
+			if (mask.control[band].legacy == 0)
+				return -EINVAL;
 		}
 	}
 
-	err = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask);
-
- unlock:
-	dev_put(dev);
-	cfg80211_unlock_rdev(rdev);
- unlock_rtnl:
-	rtnl_unlock();
-	return err;
+	return rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask);
 }
 
-static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info)
+static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
-	int err;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION;
 
 	if (!info->attrs[NL80211_ATTR_FRAME_MATCH])
 		return -EINVAL;
 
-	if (nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]) < 1)
-		return -EINVAL;
+	if (info->attrs[NL80211_ATTR_FRAME_TYPE])
+		frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]);
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
 
 	/* not much point in registering if we can't reply */
-	if (!rdev->ops->action) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->mgmt_tx)
+		return -EOPNOTSUPP;
 
-	err = cfg80211_mlme_register_action(dev->ieee80211_ptr, info->snd_pid,
+	return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid,
+			frame_type,
 			nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
 			nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]));
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
- unlock_rtnl:
-	rtnl_unlock();
-	return err;
 }
 
-static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
+static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
-	struct net_device *dev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
 	struct ieee80211_channel *chan;
 	enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
+	bool channel_type_valid = false;
 	u32 freq;
 	int err;
 	void *hdr;
@@ -4691,26 +4185,16 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
 	    !info->attrs[NL80211_ATTR_WIPHY_FREQ])
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
-	if (!rdev->ops->action) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-
-	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->mgmt_tx)
+		return -EOPNOTSUPP;
 
-	if (!netif_running(dev)) {
-		err = -ENETDOWN;
-		goto out;
-	}
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
 
 	if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
 		channel_type = nla_get_u32(
@@ -4718,145 +4202,104 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info)
 		if (channel_type != NL80211_CHAN_NO_HT &&
 		    channel_type != NL80211_CHAN_HT20 &&
 		    channel_type != NL80211_CHAN_HT40PLUS &&
-		    channel_type != NL80211_CHAN_HT40MINUS) {
-			err = -EINVAL;
-			goto out;
-		}
+		    channel_type != NL80211_CHAN_HT40MINUS)
+			return -EINVAL;
+		channel_type_valid = true;
 	}
 
 	freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
 	chan = rdev_freq_to_chan(rdev, freq, channel_type);
-	if (chan == NULL) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (chan == NULL)
+		return -EINVAL;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 
 	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
-			     NL80211_CMD_ACTION);
+			     NL80211_CMD_FRAME);
 
 	if (IS_ERR(hdr)) {
 		err = PTR_ERR(hdr);
 		goto free_msg;
 	}
-	err = cfg80211_mlme_action(rdev, dev, chan, channel_type,
-				   nla_data(info->attrs[NL80211_ATTR_FRAME]),
-				   nla_len(info->attrs[NL80211_ATTR_FRAME]),
-				   &cookie);
+	err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, channel_type,
+				    channel_type_valid,
+				    nla_data(info->attrs[NL80211_ATTR_FRAME]),
+				    nla_len(info->attrs[NL80211_ATTR_FRAME]),
+				    &cookie);
 	if (err)
 		goto free_msg;
 
 	NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie);
 
 	genlmsg_end(msg, hdr);
-	err = genlmsg_reply(msg, info);
-	goto out;
+	return genlmsg_reply(msg, info);
 
  nla_put_failure:
 	err = -ENOBUFS;
  free_msg:
 	nlmsg_free(msg);
- out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-unlock_rtnl:
-	rtnl_unlock();
 	return err;
 }
 
 static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct wireless_dev *wdev;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	u8 ps_state;
 	bool state;
 	int err;
 
-	if (!info->attrs[NL80211_ATTR_PS_STATE]) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!info->attrs[NL80211_ATTR_PS_STATE])
+		return -EINVAL;
 
 	ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]);
 
-	if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rdev;
+	if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED)
+		return -EINVAL;
 
 	wdev = dev->ieee80211_ptr;
 
-	if (!rdev->ops->set_power_mgmt) {
-		err = -EOPNOTSUPP;
-		goto unlock_rdev;
-	}
+	if (!rdev->ops->set_power_mgmt)
+		return -EOPNOTSUPP;
 
 	state = (ps_state == NL80211_PS_ENABLED) ? true : false;
 
 	if (state == wdev->ps)
-		goto unlock_rdev;
-
-	wdev->ps = state;
-
-	if (rdev->ops->set_power_mgmt(wdev->wiphy, dev, wdev->ps,
-				      wdev->ps_timeout))
-		/* assume this means it's off */
-		wdev->ps = false;
-
-unlock_rdev:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-	rtnl_unlock();
+		return 0;
 
-out:
+	err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, state,
+					wdev->ps_timeout);
+	if (!err)
+		wdev->ps = state;
 	return err;
 }
 
 static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	enum nl80211_ps_state ps_state;
 	struct wireless_dev *wdev;
-	struct net_device *dev;
+	struct net_device *dev = info->user_ptr[1];
 	struct sk_buff *msg;
 	void *hdr;
 	int err;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rtnl;
-
 	wdev = dev->ieee80211_ptr;
 
-	if (!rdev->ops->set_power_mgmt) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->set_power_mgmt)
+		return -EOPNOTSUPP;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 
 	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
 			     NL80211_CMD_GET_POWER_SAVE);
 	if (!hdr) {
-		err = -ENOMEM;
+		err = -ENOBUFS;
 		goto free_msg;
 	}
 
@@ -4868,22 +4311,12 @@ static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info)
 	NLA_PUT_U32(msg, NL80211_ATTR_PS_STATE, ps_state);
 
 	genlmsg_end(msg, hdr);
-	err = genlmsg_reply(msg, info);
-	goto out;
+	return genlmsg_reply(msg, info);
 
-nla_put_failure:
+ nla_put_failure:
 	err = -ENOBUFS;
-
-free_msg:
+ free_msg:
 	nlmsg_free(msg);
-
-out:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-
-unlock_rtnl:
-	rtnl_unlock();
-
 	return err;
 }
 
@@ -4897,41 +4330,24 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = {
 static int nl80211_set_cqm_rssi(struct genl_info *info,
 				s32 threshold, u32 hysteresis)
 {
-	struct cfg80211_registered_device *rdev;
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct wireless_dev *wdev;
-	struct net_device *dev;
-	int err;
+	struct net_device *dev = info->user_ptr[1];
 
 	if (threshold > 0)
 		return -EINVAL;
 
-	rtnl_lock();
-
-	err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
-	if (err)
-		goto unlock_rdev;
-
 	wdev = dev->ieee80211_ptr;
 
-	if (!rdev->ops->set_cqm_rssi_config) {
-		err = -EOPNOTSUPP;
-		goto unlock_rdev;
-	}
-
-	if (wdev->iftype != NL80211_IFTYPE_STATION) {
-		err = -EOPNOTSUPP;
-		goto unlock_rdev;
-	}
-
-	err = rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
-					     threshold, hysteresis);
+	if (!rdev->ops->set_cqm_rssi_config)
+		return -EOPNOTSUPP;
 
-unlock_rdev:
-	cfg80211_unlock_rdev(rdev);
-	dev_put(dev);
-	rtnl_unlock();
+	if (wdev->iftype != NL80211_IFTYPE_STATION &&
+	    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
+		return -EOPNOTSUPP;
 
-	return err;
+	return rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev,
+					      threshold, hysteresis);
 }
 
 static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info)
@@ -4965,6 +4381,65 @@ out:
 	return err;
 }
 
+#define NL80211_FLAG_NEED_WIPHY		0x01
+#define NL80211_FLAG_NEED_NETDEV	0x02
+#define NL80211_FLAG_NEED_RTNL		0x04
+#define NL80211_FLAG_CHECK_NETDEV_UP	0x08
+#define NL80211_FLAG_NEED_NETDEV_UP	(NL80211_FLAG_NEED_NETDEV |\
+					 NL80211_FLAG_CHECK_NETDEV_UP)
+
+static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb,
+			    struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev;
+	struct net_device *dev;
+	int err;
+	bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL;
+
+	if (rtnl)
+		rtnl_lock();
+
+	if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) {
+		rdev = cfg80211_get_dev_from_info(info);
+		if (IS_ERR(rdev)) {
+			if (rtnl)
+				rtnl_unlock();
+			return PTR_ERR(rdev);
+		}
+		info->user_ptr[0] = rdev;
+	} else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) {
+		err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev);
+		if (err) {
+			if (rtnl)
+				rtnl_unlock();
+			return err;
+		}
+		if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP &&
+		    !netif_running(dev)) {
+			cfg80211_unlock_rdev(rdev);
+			dev_put(dev);
+			if (rtnl)
+				rtnl_unlock();
+			return -ENETDOWN;
+		}
+		info->user_ptr[0] = rdev;
+		info->user_ptr[1] = dev;
+	}
+
+	return 0;
+}
+
+static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb,
+			      struct genl_info *info)
+{
+	if (info->user_ptr[0])
+		cfg80211_unlock_rdev(info->user_ptr[0]);
+	if (info->user_ptr[1])
+		dev_put(info->user_ptr[1]);
+	if (ops->internal_flags & NL80211_FLAG_NEED_RTNL)
+		rtnl_unlock();
+}
+
 static struct genl_ops nl80211_ops[] = {
 	{
 		.cmd = NL80211_CMD_GET_WIPHY,
@@ -4972,12 +4447,14 @@ static struct genl_ops nl80211_ops[] = {
 		.dumpit = nl80211_dump_wiphy,
 		.policy = nl80211_policy,
 		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_WIPHY,
 	},
 	{
 		.cmd = NL80211_CMD_SET_WIPHY,
 		.doit = nl80211_set_wiphy,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_INTERFACE,
@@ -4985,90 +4462,119 @@ static struct genl_ops nl80211_ops[] = {
 		.dumpit = nl80211_dump_interface,
 		.policy = nl80211_policy,
 		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_NETDEV,
 	},
 	{
 		.cmd = NL80211_CMD_SET_INTERFACE,
 		.doit = nl80211_set_interface,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_NEW_INTERFACE,
 		.doit = nl80211_new_interface,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEL_INTERFACE,
 		.doit = nl80211_del_interface,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_KEY,
 		.doit = nl80211_get_key,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_KEY,
 		.doit = nl80211_set_key,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_NEW_KEY,
 		.doit = nl80211_new_key,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEL_KEY,
 		.doit = nl80211_del_key,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_BEACON,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 		.doit = nl80211_addset_beacon,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_NEW_BEACON,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 		.doit = nl80211_addset_beacon,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEL_BEACON,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 		.doit = nl80211_del_beacon,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_STATION,
 		.doit = nl80211_get_station,
 		.dumpit = nl80211_dump_station,
 		.policy = nl80211_policy,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_STATION,
 		.doit = nl80211_set_station,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_NEW_STATION,
 		.doit = nl80211_new_station,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEL_STATION,
 		.doit = nl80211_del_station,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_MPATH,
@@ -5076,30 +4582,40 @@ static struct genl_ops nl80211_ops[] = {
 		.dumpit = nl80211_dump_mpath,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_MPATH,
 		.doit = nl80211_set_mpath,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_NEW_MPATH,
 		.doit = nl80211_new_mpath,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEL_MPATH,
 		.doit = nl80211_del_mpath,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_BSS,
 		.doit = nl80211_set_bss,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_REG,
@@ -5124,18 +4640,24 @@ static struct genl_ops nl80211_ops[] = {
 		.doit = nl80211_get_mesh_params,
 		.policy = nl80211_policy,
 		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_MESH_PARAMS,
 		.doit = nl80211_set_mesh_params,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_TRIGGER_SCAN,
 		.doit = nl80211_trigger_scan,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_SCAN,
@@ -5147,36 +4669,48 @@ static struct genl_ops nl80211_ops[] = {
 		.doit = nl80211_authenticate,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_ASSOCIATE,
 		.doit = nl80211_associate,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEAUTHENTICATE,
 		.doit = nl80211_deauthenticate,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DISASSOCIATE,
 		.doit = nl80211_disassociate,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_JOIN_IBSS,
 		.doit = nl80211_join_ibss,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_LEAVE_IBSS,
 		.doit = nl80211_leave_ibss,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 #ifdef CONFIG_NL80211_TESTMODE
 	{
@@ -5184,6 +4718,8 @@ static struct genl_ops nl80211_ops[] = {
 		.doit = nl80211_testmode_do,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 #endif
 	{
@@ -5191,18 +4727,24 @@ static struct genl_ops nl80211_ops[] = {
 		.doit = nl80211_connect,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DISCONNECT,
 		.doit = nl80211_disconnect,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_WIPHY_NETNS,
 		.doit = nl80211_wiphy_netns,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_SURVEY,
@@ -5214,72 +4756,104 @@ static struct genl_ops nl80211_ops[] = {
 		.doit = nl80211_setdel_pmksa,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_DEL_PMKSA,
 		.doit = nl80211_setdel_pmksa,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_FLUSH_PMKSA,
 		.doit = nl80211_flush_pmksa,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_REMAIN_ON_CHANNEL,
 		.doit = nl80211_remain_on_channel,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL,
 		.doit = nl80211_cancel_remain_on_channel,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_TX_BITRATE_MASK,
 		.doit = nl80211_set_tx_bitrate_mask,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
-		.cmd = NL80211_CMD_REGISTER_ACTION,
-		.doit = nl80211_register_action,
+		.cmd = NL80211_CMD_REGISTER_FRAME,
+		.doit = nl80211_register_mgmt,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
-		.cmd = NL80211_CMD_ACTION,
-		.doit = nl80211_action,
+		.cmd = NL80211_CMD_FRAME,
+		.doit = nl80211_tx_mgmt,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_POWER_SAVE,
 		.doit = nl80211_set_power_save,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_GET_POWER_SAVE,
 		.doit = nl80211_get_power_save,
 		.policy = nl80211_policy,
 		/* can be retrieved by unprivileged users */
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_CQM,
 		.doit = nl80211_set_cqm,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 	{
 		.cmd = NL80211_CMD_SET_CHANNEL,
 		.doit = nl80211_set_channel,
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_WDS_PEER,
+		.doit = nl80211_set_wds_peer,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV |
+				  NL80211_FLAG_NEED_RTNL,
 	},
 };
 
@@ -5953,9 +5527,9 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
 				nl80211_mlme_mcgrp.id, gfp);
 }
 
-int nl80211_send_action(struct cfg80211_registered_device *rdev,
-			struct net_device *netdev, u32 nlpid,
-			int freq, const u8 *buf, size_t len, gfp_t gfp)
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+		      struct net_device *netdev, u32 nlpid,
+		      int freq, const u8 *buf, size_t len, gfp_t gfp)
 {
 	struct sk_buff *msg;
 	void *hdr;
@@ -5965,7 +5539,7 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return -ENOMEM;
 
-	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION);
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME);
 	if (!hdr) {
 		nlmsg_free(msg);
 		return -ENOMEM;
@@ -5993,10 +5567,10 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev,
 	return -ENOBUFS;
 }
 
-void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
-				   struct net_device *netdev, u64 cookie,
-				   const u8 *buf, size_t len, bool ack,
-				   gfp_t gfp)
+void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, u64 cookie,
+				 const u8 *buf, size_t len, bool ack,
+				 gfp_t gfp)
 {
 	struct sk_buff *msg;
 	void *hdr;
@@ -6005,7 +5579,7 @@ void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
 	if (!msg)
 		return;
 
-	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION_TX_STATUS);
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS);
 	if (!hdr) {
 		nlmsg_free(msg);
 		return;
@@ -6092,7 +5666,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb,
 
 	list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list)
 		list_for_each_entry_rcu(wdev, &rdev->netdev_list, list)
-			cfg80211_mlme_unregister_actions(wdev, notify->pid);
+			cfg80211_mlme_unregister_socket(wdev, notify->pid);
 
 	rcu_read_unlock();
 
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 2ad7fbc7d9f..30d2f939150 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -74,13 +74,13 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev,
 			    struct net_device *dev, const u8 *mac_addr,
 			    struct station_info *sinfo, gfp_t gfp);
 
-int nl80211_send_action(struct cfg80211_registered_device *rdev,
-			struct net_device *netdev, u32 nlpid, int freq,
-			const u8 *buf, size_t len, gfp_t gfp);
-void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev,
-				   struct net_device *netdev, u64 cookie,
-				   const u8 *buf, size_t len, bool ack,
-				   gfp_t gfp);
+int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
+		      struct net_device *netdev, u32 nlpid, int freq,
+		      const u8 *buf, size_t len, gfp_t gfp);
+void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev,
+				 struct net_device *netdev, u64 cookie,
+				 const u8 *buf, size_t len, bool ack,
+				 gfp_t gfp);
 
 void
 nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index 1332c445d1c..dbe35e138e9 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -14,6 +14,7 @@
  * See COPYING for more details.
  */
 
+#include <linux/kernel.h>
 #include <net/cfg80211.h>
 #include <net/ieee80211_radiotap.h>
 #include <asm/unaligned.h>
@@ -45,7 +46,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = {
 };
 
 static const struct ieee80211_radiotap_namespace radiotap_ns = {
-	.n_bits = sizeof(rtap_namespace_sizes) / sizeof(rtap_namespace_sizes[0]),
+	.n_bits = ARRAY_SIZE(rtap_namespace_sizes),
 	.align_size = rtap_namespace_sizes,
 };
 
@@ -200,7 +201,7 @@ int ieee80211_radiotap_iterator_next(
 {
 	while (1) {
 		int hit = 0;
-		int pad, align, size, subns, vnslen;
+		int pad, align, size, subns;
 		uint32_t oui;
 
 		/* if no more EXT bits, that's it */
@@ -260,6 +261,27 @@ int ieee80211_radiotap_iterator_next(
 		if (pad)
 			iterator->_arg += align - pad;
 
+		if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) {
+			int vnslen;
+
+			if ((unsigned long)iterator->_arg + size -
+			    (unsigned long)iterator->_rtheader >
+			    (unsigned long)iterator->_max_length)
+				return -EINVAL;
+
+			oui = (*iterator->_arg << 16) |
+				(*(iterator->_arg + 1) << 8) |
+				*(iterator->_arg + 2);
+			subns = *(iterator->_arg + 3);
+
+			find_ns(iterator, oui, subns);
+
+			vnslen = get_unaligned_le16(iterator->_arg + 4);
+			iterator->_next_ns_data = iterator->_arg + size + vnslen;
+			if (!iterator->current_namespace)
+				size += vnslen;
+		}
+
 		/*
 		 * this is what we will return to user, but we need to
 		 * move on first so next call has something fresh to test
@@ -286,40 +308,25 @@ int ieee80211_radiotap_iterator_next(
 		/* these special ones are valid in each bitmap word */
 		switch (iterator->_arg_index % 32) {
 		case IEEE80211_RADIOTAP_VENDOR_NAMESPACE:
-			iterator->_bitmap_shifter >>= 1;
-			iterator->_arg_index++;
-
 			iterator->_reset_on_ext = 1;
 
-			vnslen = get_unaligned_le16(iterator->this_arg + 4);
-			iterator->_next_ns_data = iterator->_arg + vnslen;
-			oui = (*iterator->this_arg << 16) |
-				(*(iterator->this_arg + 1) << 8) |
-				*(iterator->this_arg + 2);
-			subns = *(iterator->this_arg + 3);
-
-			find_ns(iterator, oui, subns);
-
 			iterator->is_radiotap_ns = 0;
-			/* allow parsers to show this information */
+			/*
+			 * If parser didn't register this vendor
+			 * namespace with us, allow it to show it
+			 * as 'raw. Do do that, set argument index
+			 * to vendor namespace.
+			 */
 			iterator->this_arg_index =
 				IEEE80211_RADIOTAP_VENDOR_NAMESPACE;
-			iterator->this_arg_size += vnslen;
-			if ((unsigned long)iterator->this_arg +
-			    iterator->this_arg_size -
-			    (unsigned long)iterator->_rtheader >
-			    (unsigned long)(unsigned long)iterator->_max_length)
-				return -EINVAL;
-			hit = 1;
-			break;
+			if (!iterator->current_namespace)
+				hit = 1;
+			goto next_entry;
 		case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE:
-			iterator->_bitmap_shifter >>= 1;
-			iterator->_arg_index++;
-
 			iterator->_reset_on_ext = 1;
 			iterator->current_namespace = &radiotap_ns;
 			iterator->is_radiotap_ns = 1;
-			break;
+			goto next_entry;
 		case IEEE80211_RADIOTAP_EXT:
 			/*
 			 * bit 31 was set, there is more
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 8f0d97dd310..4b9f8912526 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -36,6 +36,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/random.h>
+#include <linux/ctype.h>
 #include <linux/nl80211.h>
 #include <linux/platform_device.h>
 #include <net/cfg80211.h>
@@ -67,21 +68,17 @@ static struct platform_device *reg_pdev;
 const struct ieee80211_regdomain *cfg80211_regdomain;
 
 /*
- * We use this as a place for the rd structure built from the
- * last parsed country IE to rest until CRDA gets back to us with
- * what it thinks should apply for the same country
- */
-static const struct ieee80211_regdomain *country_ie_regdomain;
-
-/*
  * Protects static reg.c components:
  *     - cfg80211_world_regdom
  *     - cfg80211_regdom
- *     - country_ie_regdomain
  *     - last_request
  */
-DEFINE_MUTEX(reg_mutex);
-#define assert_reg_lock() WARN_ON(!mutex_is_locked(&reg_mutex))
+static DEFINE_MUTEX(reg_mutex);
+
+static inline void assert_reg_lock(void)
+{
+	lockdep_assert_held(&reg_mutex);
+}
 
 /* Used to queue up regulatory hints */
 static LIST_HEAD(reg_requests_list);
@@ -189,14 +186,6 @@ static bool is_alpha2_set(const char *alpha2)
 	return false;
 }
 
-static bool is_alpha_upper(char letter)
-{
-	/* ASCII A - Z */
-	if (letter >= 65 && letter <= 90)
-		return true;
-	return false;
-}
-
 static bool is_unknown_alpha2(const char *alpha2)
 {
 	if (!alpha2)
@@ -228,7 +217,7 @@ static bool is_an_alpha2(const char *alpha2)
 {
 	if (!alpha2)
 		return false;
-	if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1]))
+	if (isalpha(alpha2[0]) && isalpha(alpha2[1]))
 		return true;
 	return false;
 }
@@ -275,25 +264,6 @@ static bool is_user_regdom_saved(void)
 	return true;
 }
 
-/**
- * country_ie_integrity_changes - tells us if the country IE has changed
- * @checksum: checksum of country IE of fields we are interested in
- *
- * If the country IE has not changed you can ignore it safely. This is
- * useful to determine if two devices are seeing two different country IEs
- * even on the same alpha2. Note that this will return false if no IE has
- * been set on the wireless core yet.
- */
-static bool country_ie_integrity_changes(u32 checksum)
-{
-	/* If no IE has been set then the checksum doesn't change */
-	if (unlikely(!last_request->country_ie_checksum))
-		return false;
-	if (unlikely(last_request->country_ie_checksum != checksum))
-		return true;
-	return false;
-}
-
 static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd,
 			 const struct ieee80211_regdomain *src_regd)
 {
@@ -506,471 +476,6 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
 }
 
 /*
- * This is a work around for sanity checking ieee80211_channel_to_frequency()'s
- * work. ieee80211_channel_to_frequency() can for example currently provide a
- * 2 GHz channel when in fact a 5 GHz channel was desired. An example would be
- * an AP providing channel 8 on a country IE triplet when it sent this on the
- * 5 GHz band, that channel is designed to be channel 8 on 5 GHz, not a 2 GHz
- * channel.
- *
- * This can be removed once ieee80211_channel_to_frequency() takes in a band.
- */
-static bool chan_in_band(int chan, enum ieee80211_band band)
-{
-	int center_freq = ieee80211_channel_to_frequency(chan);
-
-	switch (band) {
-	case IEEE80211_BAND_2GHZ:
-		if (center_freq <= 2484)
-			return true;
-		return false;
-	case IEEE80211_BAND_5GHZ:
-		if (center_freq >= 5005)
-			return true;
-		return false;
-	default:
-		return false;
-	}
-}
-
-/*
- * Some APs may send a country IE triplet for each channel they
- * support and while this is completely overkill and silly we still
- * need to support it. We avoid making a single rule for each channel
- * though and to help us with this we use this helper to find the
- * actual subband end channel. These type of country IE triplet
- * scenerios are handled then, all yielding two regulaotry rules from
- * parsing a country IE:
- *
- * [1]
- * [2]
- * [36]
- * [40]
- *
- * [1]
- * [2-4]
- * [5-12]
- * [36]
- * [40-44]
- *
- * [1-4]
- * [5-7]
- * [36-44]
- * [48-64]
- *
- * [36-36]
- * [40-40]
- * [44-44]
- * [48-48]
- * [52-52]
- * [56-56]
- * [60-60]
- * [64-64]
- * [100-100]
- * [104-104]
- * [108-108]
- * [112-112]
- * [116-116]
- * [120-120]
- * [124-124]
- * [128-128]
- * [132-132]
- * [136-136]
- * [140-140]
- *
- * Returns 0 if the IE has been found to be invalid in the middle
- * somewhere.
- */
-static int max_subband_chan(enum ieee80211_band band,
-			    int orig_cur_chan,
-			    int orig_end_channel,
-			    s8 orig_max_power,
-			    u8 **country_ie,
-			    u8 *country_ie_len)
-{
-	u8 *triplets_start = *country_ie;
-	u8 len_at_triplet = *country_ie_len;
-	int end_subband_chan = orig_end_channel;
-
-	/*
-	 * We'll deal with padding for the caller unless
-	 * its not immediate and we don't process any channels
-	 */
-	if (*country_ie_len == 1) {
-		*country_ie += 1;
-		*country_ie_len -= 1;
-		return orig_end_channel;
-	}
-
-	/* Move to the next triplet and then start search */
-	*country_ie += 3;
-	*country_ie_len -= 3;
-
-	if (!chan_in_band(orig_cur_chan, band))
-		return 0;
-
-	while (*country_ie_len >= 3) {
-		int end_channel = 0;
-		struct ieee80211_country_ie_triplet *triplet =
-			(struct ieee80211_country_ie_triplet *) *country_ie;
-		int cur_channel = 0, next_expected_chan;
-
-		/* means last triplet is completely unrelated to this one */
-		if (triplet->ext.reg_extension_id >=
-				IEEE80211_COUNTRY_EXTENSION_ID) {
-			*country_ie -= 3;
-			*country_ie_len += 3;
-			break;
-		}
-
-		if (triplet->chans.first_channel == 0) {
-			*country_ie += 1;
-			*country_ie_len -= 1;
-			if (*country_ie_len != 0)
-				return 0;
-			break;
-		}
-
-		if (triplet->chans.num_channels == 0)
-			return 0;
-
-		/* Monitonically increasing channel order */
-		if (triplet->chans.first_channel <= end_subband_chan)
-			return 0;
-
-		if (!chan_in_band(triplet->chans.first_channel, band))
-			return 0;
-
-		/* 2 GHz */
-		if (triplet->chans.first_channel <= 14) {
-			end_channel = triplet->chans.first_channel +
-				triplet->chans.num_channels - 1;
-		}
-		else {
-			end_channel =  triplet->chans.first_channel +
-				(4 * (triplet->chans.num_channels - 1));
-		}
-
-		if (!chan_in_band(end_channel, band))
-			return 0;
-
-		if (orig_max_power != triplet->chans.max_power) {
-			*country_ie -= 3;
-			*country_ie_len += 3;
-			break;
-		}
-
-		cur_channel = triplet->chans.first_channel;
-
-		/* The key is finding the right next expected channel */
-		if (band == IEEE80211_BAND_2GHZ)
-			next_expected_chan = end_subband_chan + 1;
-		 else
-			next_expected_chan = end_subband_chan + 4;
-
-		if (cur_channel != next_expected_chan) {
-			*country_ie -= 3;
-			*country_ie_len += 3;
-			break;
-		}
-
-		end_subband_chan = end_channel;
-
-		/* Move to the next one */
-		*country_ie += 3;
-		*country_ie_len -= 3;
-
-		/*
-		 * Padding needs to be dealt with if we processed
-		 * some channels.
-		 */
-		if (*country_ie_len == 1) {
-			*country_ie += 1;
-			*country_ie_len -= 1;
-			break;
-		}
-
-		/* If seen, the IE is invalid */
-		if (*country_ie_len == 2)
-			return 0;
-	}
-
-	if (end_subband_chan == orig_end_channel) {
-		*country_ie = triplets_start;
-		*country_ie_len = len_at_triplet;
-		return orig_end_channel;
-	}
-
-	return end_subband_chan;
-}
-
-/*
- * Converts a country IE to a regulatory domain. A regulatory domain
- * structure has a lot of information which the IE doesn't yet have,
- * so for the other values we use upper max values as we will intersect
- * with our userspace regulatory agent to get lower bounds.
- */
-static struct ieee80211_regdomain *country_ie_2_rd(
-				enum ieee80211_band band,
-				u8 *country_ie,
-				u8 country_ie_len,
-				u32 *checksum)
-{
-	struct ieee80211_regdomain *rd = NULL;
-	unsigned int i = 0;
-	char alpha2[2];
-	u32 flags = 0;
-	u32 num_rules = 0, size_of_regd = 0;
-	u8 *triplets_start = NULL;
-	u8 len_at_triplet = 0;
-	/* the last channel we have registered in a subband (triplet) */
-	int last_sub_max_channel = 0;
-
-	*checksum = 0xDEADBEEF;
-
-	/* Country IE requirements */
-	BUG_ON(country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN ||
-		country_ie_len & 0x01);
-
-	alpha2[0] = country_ie[0];
-	alpha2[1] = country_ie[1];
-
-	/*
-	 * Third octet can be:
-	 *    'I' - Indoor
-	 *    'O' - Outdoor
-	 *
-	 *  anything else we assume is no restrictions
-	 */
-	if (country_ie[2] == 'I')
-		flags = NL80211_RRF_NO_OUTDOOR;
-	else if (country_ie[2] == 'O')
-		flags = NL80211_RRF_NO_INDOOR;
-
-	country_ie += 3;
-	country_ie_len -= 3;
-
-	triplets_start = country_ie;
-	len_at_triplet = country_ie_len;
-
-	*checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8);
-
-	/*
-	 * We need to build a reg rule for each triplet, but first we must
-	 * calculate the number of reg rules we will need. We will need one
-	 * for each channel subband
-	 */
-	while (country_ie_len >= 3) {
-		int end_channel = 0;
-		struct ieee80211_country_ie_triplet *triplet =
-			(struct ieee80211_country_ie_triplet *) country_ie;
-		int cur_sub_max_channel = 0, cur_channel = 0;
-
-		if (triplet->ext.reg_extension_id >=
-				IEEE80211_COUNTRY_EXTENSION_ID) {
-			country_ie += 3;
-			country_ie_len -= 3;
-			continue;
-		}
-
-		/*
-		 * APs can add padding to make length divisible
-		 * by two, required by the spec.
-		 */
-		if (triplet->chans.first_channel == 0) {
-			country_ie++;
-			country_ie_len--;
-			/* This is expected to be at the very end only */
-			if (country_ie_len != 0)
-				return NULL;
-			break;
-		}
-
-		if (triplet->chans.num_channels == 0)
-			return NULL;
-
-		if (!chan_in_band(triplet->chans.first_channel, band))
-			return NULL;
-
-		/* 2 GHz */
-		if (band == IEEE80211_BAND_2GHZ)
-			end_channel = triplet->chans.first_channel +
-				triplet->chans.num_channels - 1;
-		else
-			/*
-			 * 5 GHz -- For example in country IEs if the first
-			 * channel given is 36 and the number of channels is 4
-			 * then the individual channel numbers defined for the
-			 * 5 GHz PHY by these parameters are: 36, 40, 44, and 48
-			 * and not 36, 37, 38, 39.
-			 *
-			 * See: http://tinyurl.com/11d-clarification
-			 */
-			end_channel =  triplet->chans.first_channel +
-				(4 * (triplet->chans.num_channels - 1));
-
-		cur_channel = triplet->chans.first_channel;
-
-		/*
-		 * Enhancement for APs that send a triplet for every channel
-		 * or for whatever reason sends triplets with multiple channels
-		 * separated when in fact they should be together.
-		 */
-		end_channel = max_subband_chan(band,
-					       cur_channel,
-					       end_channel,
-					       triplet->chans.max_power,
-					       &country_ie,
-					       &country_ie_len);
-		if (!end_channel)
-			return NULL;
-
-		if (!chan_in_band(end_channel, band))
-			return NULL;
-
-		cur_sub_max_channel = end_channel;
-
-		/* Basic sanity check */
-		if (cur_sub_max_channel < cur_channel)
-			return NULL;
-
-		/*
-		 * Do not allow overlapping channels. Also channels
-		 * passed in each subband must be monotonically
-		 * increasing
-		 */
-		if (last_sub_max_channel) {
-			if (cur_channel <= last_sub_max_channel)
-				return NULL;
-			if (cur_sub_max_channel <= last_sub_max_channel)
-				return NULL;
-		}
-
-		/*
-		 * When dot11RegulatoryClassesRequired is supported
-		 * we can throw ext triplets as part of this soup,
-		 * for now we don't care when those change as we
-		 * don't support them
-		 */
-		*checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) |
-		  ((cur_sub_max_channel ^ cur_sub_max_channel) << 16) |
-		  ((triplet->chans.max_power ^ cur_sub_max_channel) << 24);
-
-		last_sub_max_channel = cur_sub_max_channel;
-
-		num_rules++;
-
-		if (country_ie_len >= 3) {
-			country_ie += 3;
-			country_ie_len -= 3;
-		}
-
-		/*
-		 * Note: this is not a IEEE requirement but
-		 * simply a memory requirement
-		 */
-		if (num_rules > NL80211_MAX_SUPP_REG_RULES)
-			return NULL;
-	}
-
-	country_ie = triplets_start;
-	country_ie_len = len_at_triplet;
-
-	size_of_regd = sizeof(struct ieee80211_regdomain) +
-		(num_rules * sizeof(struct ieee80211_reg_rule));
-
-	rd = kzalloc(size_of_regd, GFP_KERNEL);
-	if (!rd)
-		return NULL;
-
-	rd->n_reg_rules = num_rules;
-	rd->alpha2[0] = alpha2[0];
-	rd->alpha2[1] = alpha2[1];
-
-	/* This time around we fill in the rd */
-	while (country_ie_len >= 3) {
-		int end_channel = 0;
-		struct ieee80211_country_ie_triplet *triplet =
-			(struct ieee80211_country_ie_triplet *) country_ie;
-		struct ieee80211_reg_rule *reg_rule = NULL;
-		struct ieee80211_freq_range *freq_range = NULL;
-		struct ieee80211_power_rule *power_rule = NULL;
-
-		/*
-		 * Must parse if dot11RegulatoryClassesRequired is true,
-		 * we don't support this yet
-		 */
-		if (triplet->ext.reg_extension_id >=
-				IEEE80211_COUNTRY_EXTENSION_ID) {
-			country_ie += 3;
-			country_ie_len -= 3;
-			continue;
-		}
-
-		if (triplet->chans.first_channel == 0) {
-			country_ie++;
-			country_ie_len--;
-			break;
-		}
-
-		reg_rule = &rd->reg_rules[i];
-		freq_range = &reg_rule->freq_range;
-		power_rule = &reg_rule->power_rule;
-
-		reg_rule->flags = flags;
-
-		/* 2 GHz */
-		if (band == IEEE80211_BAND_2GHZ)
-			end_channel = triplet->chans.first_channel +
-				triplet->chans.num_channels -1;
-		else
-			end_channel =  triplet->chans.first_channel +
-				(4 * (triplet->chans.num_channels - 1));
-
-		end_channel = max_subband_chan(band,
-					       triplet->chans.first_channel,
-					       end_channel,
-					       triplet->chans.max_power,
-					       &country_ie,
-					       &country_ie_len);
-
-		/*
-		 * The +10 is since the regulatory domain expects
-		 * the actual band edge, not the center of freq for
-		 * its start and end freqs, assuming 20 MHz bandwidth on
-		 * the channels passed
-		 */
-		freq_range->start_freq_khz =
-			MHZ_TO_KHZ(ieee80211_channel_to_frequency(
-				triplet->chans.first_channel) - 10);
-		freq_range->end_freq_khz =
-			MHZ_TO_KHZ(ieee80211_channel_to_frequency(
-				end_channel) + 10);
-
-		/*
-		 * These are large arbitrary values we use to intersect later.
-		 * Increment this if we ever support >= 40 MHz channels
-		 * in IEEE 802.11
-		 */
-		freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40);
-		power_rule->max_antenna_gain = DBI_TO_MBI(100);
-		power_rule->max_eirp = DBM_TO_MBM(triplet->chans.max_power);
-
-		i++;
-
-		if (country_ie_len >= 3) {
-			country_ie += 3;
-			country_ie_len -= 3;
-		}
-
-		BUG_ON(i > NL80211_MAX_SUPP_REG_RULES);
-	}
-
-	return rd;
-}
-
-
-/*
  * Helper for regdom_intersect(), this does the real
  * mathematical intersection fun
  */
@@ -1191,7 +696,6 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
 
 	return -EINVAL;
 }
-EXPORT_SYMBOL(freq_reg_info);
 
 int freq_reg_info(struct wiphy *wiphy,
 		  u32 center_freq,
@@ -1205,6 +709,7 @@ int freq_reg_info(struct wiphy *wiphy,
 				  reg_rule,
 				  NULL);
 }
+EXPORT_SYMBOL(freq_reg_info);
 
 /*
  * Note that right now we assume the desired channel bandwidth
@@ -1243,41 +748,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
 			  desired_bw_khz,
 			  &reg_rule);
 
-	if (r) {
-		/*
-		 * This means no regulatory rule was found in the country IE
-		 * with a frequency range on the center_freq's band, since
-		 * IEEE-802.11 allows for a country IE to have a subset of the
-		 * regulatory information provided in a country we ignore
-		 * disabling the channel unless at least one reg rule was
-		 * found on the center_freq's band. For details see this
-		 * clarification:
-		 *
-		 * http://tinyurl.com/11d-clarification
-		 */
-		if (r == -ERANGE &&
-		    last_request->initiator ==
-		    NL80211_REGDOM_SET_BY_COUNTRY_IE) {
-			REG_DBG_PRINT("cfg80211: Leaving channel %d MHz "
-				"intact on %s - no rule found in band on "
-				"Country IE\n",
-			chan->center_freq, wiphy_name(wiphy));
-		} else {
-		/*
-		 * In this case we know the country IE has at least one reg rule
-		 * for the band so we respect its band definitions
-		 */
-			if (last_request->initiator ==
-			    NL80211_REGDOM_SET_BY_COUNTRY_IE)
-				REG_DBG_PRINT("cfg80211: Disabling "
-					"channel %d MHz on %s due to "
-					"Country IE\n",
-					chan->center_freq, wiphy_name(wiphy));
-			flags |= IEEE80211_CHAN_DISABLED;
-			chan->flags = flags;
-		}
+	if (r)
 		return;
-	}
 
 	power_rule = &reg_rule->power_rule;
 	freq_range = &reg_rule->freq_range;
@@ -1695,7 +1167,7 @@ static int ignore_request(struct wiphy *wiphy,
 				return 0;
 			return -EALREADY;
 		}
-		return REG_INTERSECT;
+		return 0;
 	case NL80211_REGDOM_SET_BY_DRIVER:
 		if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) {
 			if (regdom_changes(pending_request->alpha2))
@@ -1831,6 +1303,7 @@ static void reg_process_hint(struct regulatory_request *reg_request)
 {
 	int r = 0;
 	struct wiphy *wiphy = NULL;
+	enum nl80211_reg_initiator initiator = reg_request->initiator;
 
 	BUG_ON(!reg_request->alpha2);
 
@@ -1850,7 +1323,7 @@ static void reg_process_hint(struct regulatory_request *reg_request)
 	/* This is required so that the orig_* parameters are saved */
 	if (r == -EALREADY && wiphy &&
 	    wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY)
-		wiphy_update_regulatory(wiphy, reg_request->initiator);
+		wiphy_update_regulatory(wiphy, initiator);
 out:
 	mutex_unlock(&reg_mutex);
 	mutex_unlock(&cfg80211_mutex);
@@ -1923,6 +1396,11 @@ static DECLARE_WORK(reg_work, reg_todo);
 
 static void queue_regulatory_request(struct regulatory_request *request)
 {
+	if (isalpha(request->alpha2[0]))
+		request->alpha2[0] = toupper(request->alpha2[0]);
+	if (isalpha(request->alpha2[1]))
+		request->alpha2[1] = toupper(request->alpha2[1]);
+
 	spin_lock(&reg_requests_lock);
 	list_add_tail(&request->list, &reg_requests_list);
 	spin_unlock(&reg_requests_lock);
@@ -2008,35 +1486,6 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
 }
 EXPORT_SYMBOL(regulatory_hint);
 
-/* Caller must hold reg_mutex */
-static bool reg_same_country_ie_hint(struct wiphy *wiphy,
-			u32 country_ie_checksum)
-{
-	struct wiphy *request_wiphy;
-
-	assert_reg_lock();
-
-	if (unlikely(last_request->initiator !=
-	    NL80211_REGDOM_SET_BY_COUNTRY_IE))
-		return false;
-
-	request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
-
-	if (!request_wiphy)
-		return false;
-
-	if (likely(request_wiphy != wiphy))
-		return !country_ie_integrity_changes(country_ie_checksum);
-	/*
-	 * We should not have let these through at this point, they
-	 * should have been picked up earlier by the first alpha2 check
-	 * on the device
-	 */
-	if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum)))
-		return true;
-	return false;
-}
-
 /*
  * We hold wdev_lock() here so we cannot hold cfg80211_mutex() and
  * therefore cannot iterate over the rdev list here.
@@ -2046,9 +1495,7 @@ void regulatory_hint_11d(struct wiphy *wiphy,
 			 u8 *country_ie,
 			 u8 country_ie_len)
 {
-	struct ieee80211_regdomain *rd = NULL;
 	char alpha2[2];
-	u32 checksum = 0;
 	enum environment_cap env = ENVIRON_ANY;
 	struct regulatory_request *request;
 
@@ -2064,14 +1511,6 @@ void regulatory_hint_11d(struct wiphy *wiphy,
 	if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
 		goto out;
 
-	/*
-	 * Pending country IE processing, this can happen after we
-	 * call CRDA and wait for a response if a beacon was received before
-	 * we were able to process the last regulatory_hint_11d() call
-	 */
-	if (country_ie_regdomain)
-		goto out;
-
 	alpha2[0] = country_ie[0];
 	alpha2[1] = country_ie[1];
 
@@ -2090,39 +1529,14 @@ void regulatory_hint_11d(struct wiphy *wiphy,
 	    wiphy_idx_valid(last_request->wiphy_idx)))
 		goto out;
 
-	rd = country_ie_2_rd(band, country_ie, country_ie_len, &checksum);
-	if (!rd) {
-		REG_DBG_PRINT("cfg80211: Ignoring bogus country IE\n");
-		goto out;
-	}
-
-	/*
-	 * This will not happen right now but we leave it here for the
-	 * the future when we want to add suspend/resume support and having
-	 * the user move to another country after doing so, or having the user
-	 * move to another AP. Right now we just trust the first AP.
-	 *
-	 * If we hit this before we add this support we want to be informed of
-	 * it as it would indicate a mistake in the current design
-	 */
-	if (WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))
-		goto free_rd_out;
-
 	request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
 	if (!request)
-		goto free_rd_out;
-
-	/*
-	 * We keep this around for when CRDA comes back with a response so
-	 * we can intersect with that
-	 */
-	country_ie_regdomain = rd;
+		goto out;
 
 	request->wiphy_idx = get_wiphy_idx(wiphy);
-	request->alpha2[0] = rd->alpha2[0];
-	request->alpha2[1] = rd->alpha2[1];
+	request->alpha2[0] = alpha2[0];
+	request->alpha2[1] = alpha2[1];
 	request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
-	request->country_ie_checksum = checksum;
 	request->country_ie_env = env;
 
 	mutex_unlock(&reg_mutex);
@@ -2131,8 +1545,6 @@ void regulatory_hint_11d(struct wiphy *wiphy,
 
 	return;
 
-free_rd_out:
-	kfree(rd);
 out:
 	mutex_unlock(&reg_mutex);
 }
@@ -2383,33 +1795,6 @@ static void print_regdomain_info(const struct ieee80211_regdomain *rd)
 	print_rd_rules(rd);
 }
 
-#ifdef CONFIG_CFG80211_REG_DEBUG
-static void reg_country_ie_process_debug(
-	const struct ieee80211_regdomain *rd,
-	const struct ieee80211_regdomain *country_ie_regdomain,
-	const struct ieee80211_regdomain *intersected_rd)
-{
-	printk(KERN_DEBUG "cfg80211: Received country IE:\n");
-	print_regdomain_info(country_ie_regdomain);
-	printk(KERN_DEBUG "cfg80211: CRDA thinks this should applied:\n");
-	print_regdomain_info(rd);
-	if (intersected_rd) {
-		printk(KERN_DEBUG "cfg80211: We intersect both of these "
-			"and get:\n");
-		print_regdomain_info(intersected_rd);
-		return;
-	}
-	printk(KERN_DEBUG "cfg80211: Intersection between both failed\n");
-}
-#else
-static inline void reg_country_ie_process_debug(
-	const struct ieee80211_regdomain *rd,
-	const struct ieee80211_regdomain *country_ie_regdomain,
-	const struct ieee80211_regdomain *intersected_rd)
-{
-}
-#endif
-
 /* Takes ownership of rd only if it doesn't fail */
 static int __set_regdom(const struct ieee80211_regdomain *rd)
 {
@@ -2521,34 +1906,6 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 		return 0;
 	}
 
-	/*
-	 * Country IE requests are handled a bit differently, we intersect
-	 * the country IE rd with what CRDA believes that country should have
-	 */
-
-	/*
-	 * Userspace could have sent two replies with only
-	 * one kernel request. By the second reply we would have
-	 * already processed and consumed the country_ie_regdomain.
-	 */
-	if (!country_ie_regdomain)
-		return -EALREADY;
-	BUG_ON(rd == country_ie_regdomain);
-
-	/*
-	 * Intersect what CRDA returned and our what we
-	 * had built from the Country IE received
-	 */
-
-	intersected_rd = regdom_intersect(rd, country_ie_regdomain);
-
-	reg_country_ie_process_debug(rd,
-				     country_ie_regdomain,
-				     intersected_rd);
-
-	kfree(country_ie_regdomain);
-	country_ie_regdomain = NULL;
-
 	if (!intersected_rd)
 		return -EINVAL;
 
@@ -2630,7 +1987,7 @@ out:
 	mutex_unlock(&reg_mutex);
 }
 
-int regulatory_init(void)
+int __init regulatory_init(void)
 {
 	int err = 0;
 
@@ -2676,7 +2033,7 @@ int regulatory_init(void)
 	return 0;
 }
 
-void regulatory_exit(void)
+void /* __init_or_exit */ regulatory_exit(void)
 {
 	struct regulatory_request *reg_request, *tmp;
 	struct reg_beacon *reg_beacon, *btmp;
@@ -2688,9 +2045,6 @@ void regulatory_exit(void)
 
 	reset_regdomains();
 
-	kfree(country_ie_regdomain);
-	country_ie_regdomain = NULL;
-
 	kfree(last_request);
 
 	platform_device_unregister(reg_pdev);
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index b26224a9f3b..c4695d07af2 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -10,7 +10,7 @@ int regulatory_hint_user(const char *alpha2);
 
 void reg_device_remove(struct wiphy *wiphy);
 
-int regulatory_init(void);
+int __init regulatory_init(void);
 void regulatory_exit(void);
 
 int set_regdom(const struct ieee80211_regdomain *rd);
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 58401d246bd..503ebb86ba1 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -275,6 +275,7 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
 {
 	struct cfg80211_registered_device *dev = wiphy_to_dev(wiphy);
 	struct cfg80211_internal_bss *bss, *res = NULL;
+	unsigned long now = jiffies;
 
 	spin_lock_bh(&dev->bss_lock);
 
@@ -283,6 +284,10 @@ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
 			continue;
 		if (channel && bss->pub.channel != channel)
 			continue;
+		/* Don't get expired BSS structs */
+		if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) &&
+		    !atomic_read(&bss->hold))
+			continue;
 		if (is_bss(&bss->pub, bssid, ssid, ssid_len)) {
 			res = bss;
 			kref_get(&res->ref);
@@ -645,14 +650,14 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
 	bss = container_of(pub, struct cfg80211_internal_bss, pub);
 
 	spin_lock_bh(&dev->bss_lock);
+	if (!list_empty(&bss->list)) {
+		list_del_init(&bss->list);
+		dev->bss_generation++;
+		rb_erase(&bss->rbn, &dev->bss_tree);
 
-	list_del(&bss->list);
-	dev->bss_generation++;
-	rb_erase(&bss->rbn, &dev->bss_tree);
-
+		kref_put(&bss->ref, bss_release);
+	}
 	spin_unlock_bh(&dev->bss_lock);
-
-	kref_put(&bss->ref, bss_release);
 }
 EXPORT_SYMBOL(cfg80211_unlink_bss);
 
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 72222f0074d..e17b0bee6bd 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -35,7 +35,7 @@ struct cfg80211_conn {
 	bool auto_auth, prev_bssid_valid;
 };
 
-bool cfg80211_is_all_idle(void)
+static bool cfg80211_is_all_idle(void)
 {
 	struct cfg80211_registered_device *rdev;
 	struct wireless_dev *wdev;
@@ -411,7 +411,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
 		return;
 
 	if (wdev->sme_state != CFG80211_SME_CONNECTING)
@@ -548,7 +549,8 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *bssid,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
 		return;
 
 	if (wdev->sme_state != CFG80211_SME_CONNECTED)
@@ -644,7 +646,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 
 	ASSERT_WDEV_LOCK(wdev);
 
-	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION))
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
 		return;
 
 	if (wdev->sme_state != CFG80211_SME_CONNECTED)
@@ -695,7 +698,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
 	 */
 	if (rdev->ops->del_key)
 		for (i = 0; i < 6; i++)
-			rdev->ops->del_key(wdev->wiphy, dev, i, NULL);
+			rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL);
 
 #ifdef CONFIG_CFG80211_WEXT
 	memset(&wrqu, 0, sizeof(wrqu));
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 9f2cef3e0ca..4294fa22bb2 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -35,6 +35,14 @@ SHOW_FMT(index, "%d", wiphy_idx);
 SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
 SHOW_FMT(address_mask, "%pM", wiphy.addr_mask);
 
+static ssize_t name_show(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf) {
+	struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy;
+	return sprintf(buf, "%s\n", dev_name(&wiphy->dev));
+}
+
+
 static ssize_t addresses_show(struct device *dev,
 			      struct device_attribute *attr,
 			      char *buf)
@@ -57,6 +65,7 @@ static struct device_attribute ieee80211_dev_attrs[] = {
 	__ATTR_RO(macaddress),
 	__ATTR_RO(address_mask),
 	__ATTR_RO(addresses),
+	__ATTR_RO(name),
 	{}
 };
 
@@ -110,6 +119,13 @@ static int wiphy_resume(struct device *dev)
 	return ret;
 }
 
+static const void *wiphy_namespace(struct device *d)
+{
+	struct wiphy *wiphy = container_of(d, struct wiphy, dev);
+
+	return wiphy_net(wiphy);
+}
+
 struct class ieee80211_class = {
 	.name = "ieee80211",
 	.owner = THIS_MODULE,
@@ -120,6 +136,8 @@ struct class ieee80211_class = {
 #endif
 	.suspend = wiphy_suspend,
 	.resume = wiphy_resume,
+	.ns_type = &net_ns_type_operations,
+	.namespace = wiphy_namespace,
 };
 
 int wiphy_sysfs_init(void)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 3416373a9c0..76120aeda57 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -144,19 +144,25 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy)
 
 int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
 				   struct key_params *params, int key_idx,
-				   const u8 *mac_addr)
+				   bool pairwise, const u8 *mac_addr)
 {
 	int i;
 
 	if (key_idx > 5)
 		return -EINVAL;
 
+	if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+		return -EINVAL;
+
+	if (pairwise && !mac_addr)
+		return -EINVAL;
+
 	/*
 	 * Disallow pairwise keys with non-zero index unless it's WEP
 	 * (because current deployments use pairwise WEP keys with
 	 * non-zero indizes but 802.11i clearly specifies to use zero)
 	 */
-	if (mac_addr && key_idx &&
+	if (pairwise && key_idx &&
 	    params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
 	    params->cipher != WLAN_CIPHER_SUITE_WEP104)
 		return -EINVAL;
@@ -183,7 +189,14 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
 			return -EINVAL;
 		break;
 	default:
-		return -EINVAL;
+		/*
+		 * We don't know anything about this algorithm,
+		 * allow using it -- but the driver must check
+		 * all parameters! We still check below whether
+		 * or not the driver supports this algorithm,
+		 * of course.
+		 */
+		break;
 	}
 
 	if (params->seq) {
@@ -221,7 +234,7 @@ const unsigned char bridge_tunnel_header[] __aligned(2) =
 	{ 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 };
 EXPORT_SYMBOL(bridge_tunnel_header);
 
-unsigned int ieee80211_hdrlen(__le16 fc)
+unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc)
 {
 	unsigned int hdrlen = 24;
 
@@ -319,7 +332,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
 		cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
 	case cpu_to_le16(IEEE80211_FCTL_TODS):
 		if (unlikely(iftype != NL80211_IFTYPE_AP &&
-			     iftype != NL80211_IFTYPE_AP_VLAN))
+			     iftype != NL80211_IFTYPE_AP_VLAN &&
+			     iftype != NL80211_IFTYPE_P2P_GO))
 			return -1;
 		break;
 	case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS):
@@ -347,7 +361,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
 		break;
 	case cpu_to_le16(IEEE80211_FCTL_FROMDS):
 		if ((iftype != NL80211_IFTYPE_STATION &&
-		    iftype != NL80211_IFTYPE_MESH_POINT) ||
+		     iftype != NL80211_IFTYPE_P2P_CLIENT &&
+		     iftype != NL80211_IFTYPE_MESH_POINT) ||
 		    (is_multicast_ether_addr(dst) &&
 		     !compare_ether_addr(src, addr)))
 			return -1;
@@ -424,6 +439,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 	switch (iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_AP_VLAN:
+	case NL80211_IFTYPE_P2P_GO:
 		fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS);
 		/* DA BSSID SA */
 		memcpy(hdr.addr1, skb->data, ETH_ALEN);
@@ -432,6 +448,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 		hdrlen = 24;
 		break;
 	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
 		fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
 		/* BSSID SA DA */
 		memcpy(hdr.addr1, bssid, ETH_ALEN);
@@ -666,7 +683,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
 	for (i = 0; i < 6; i++) {
 		if (!wdev->connect_keys->params[i].cipher)
 			continue;
-		if (rdev->ops->add_key(wdev->wiphy, dev, i, NULL,
+		if (rdev->ops->add_key(wdev->wiphy, dev, i, false, NULL,
 					&wdev->connect_keys->params[i])) {
 			printk(KERN_ERR "%s: failed to set key %d\n",
 				dev->name, i);
@@ -770,8 +787,10 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 		return -EOPNOTSUPP;
 
 	/* if it's part of a bridge, reject changing type to station/ibss */
-	if (dev->br_port && (ntype == NL80211_IFTYPE_ADHOC ||
-			     ntype == NL80211_IFTYPE_STATION))
+	if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
+	    (ntype == NL80211_IFTYPE_ADHOC ||
+	     ntype == NL80211_IFTYPE_STATION ||
+	     ntype == NL80211_IFTYPE_P2P_CLIENT))
 		return -EBUSY;
 
 	if (ntype != otype) {
@@ -782,6 +801,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			cfg80211_leave_ibss(rdev, dev, false);
 			break;
 		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
 			cfg80211_disconnect(rdev, dev,
 					    WLAN_REASON_DEAUTH_LEAVING, true);
 			break;
@@ -810,9 +830,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			if (dev->ieee80211_ptr->use_4addr)
 				break;
 			/* fall through */
+		case NL80211_IFTYPE_P2P_CLIENT:
 		case NL80211_IFTYPE_ADHOC:
 			dev->priv_flags |= IFF_DONT_BRIDGE;
 			break;
+		case NL80211_IFTYPE_P2P_GO:
 		case NL80211_IFTYPE_AP:
 		case NL80211_IFTYPE_AP_VLAN:
 		case NL80211_IFTYPE_WDS:
@@ -823,7 +845,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
 			/* monitor can't bridge anyway */
 			break;
 		case NL80211_IFTYPE_UNSPECIFIED:
-		case __NL80211_IFTYPE_AFTER_LAST:
+		case NUM_NL80211_IFTYPES:
 			/* not happening */
 			break;
 		}
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 96342993cf9..12222ee6ebf 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -432,14 +432,17 @@ int cfg80211_wext_giwretry(struct net_device *dev,
 EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry);
 
 static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
-				     struct net_device *dev, const u8 *addr,
-				     bool remove, bool tx_key, int idx,
-				     struct key_params *params)
+				     struct net_device *dev, bool pairwise,
+				     const u8 *addr, bool remove, bool tx_key,
+				     int idx, struct key_params *params)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err, i;
 	bool rejoin = false;
 
+	if (pairwise && !addr)
+		return -EINVAL;
+
 	if (!wdev->wext.keys) {
 		wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
 					      GFP_KERNEL);
@@ -478,7 +481,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 				__cfg80211_leave_ibss(rdev, wdev->netdev, true);
 				rejoin = true;
 			}
-			err = rdev->ops->del_key(&rdev->wiphy, dev, idx, addr);
+
+			if (!pairwise && addr &&
+			    !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
+				err = -ENOENT;
+			else
+				err = rdev->ops->del_key(&rdev->wiphy, dev, idx,
+							 pairwise, addr);
 		}
 		wdev->wext.connect.privacy = false;
 		/*
@@ -507,12 +516,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 	if (addr)
 		tx_key = false;
 
-	if (cfg80211_validate_key_settings(rdev, params, idx, addr))
+	if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr))
 		return -EINVAL;
 
 	err = 0;
 	if (wdev->current_bss)
-		err = rdev->ops->add_key(&rdev->wiphy, dev, idx, addr, params);
+		err = rdev->ops->add_key(&rdev->wiphy, dev, idx,
+					 pairwise, addr, params);
 	if (err)
 		return err;
 
@@ -563,17 +573,17 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 }
 
 static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
-				   struct net_device *dev, const u8 *addr,
-				   bool remove, bool tx_key, int idx,
-				   struct key_params *params)
+				   struct net_device *dev, bool pairwise,
+				   const u8 *addr, bool remove, bool tx_key,
+				   int idx, struct key_params *params)
 {
 	int err;
 
 	/* devlist mutex needed for possible IBSS re-join */
 	mutex_lock(&rdev->devlist_mtx);
 	wdev_lock(dev->ieee80211_ptr);
-	err = __cfg80211_set_encryption(rdev, dev, addr, remove,
-					tx_key, idx, params);
+	err = __cfg80211_set_encryption(rdev, dev, pairwise, addr,
+					remove, tx_key, idx, params);
 	wdev_unlock(dev->ieee80211_ptr);
 	mutex_unlock(&rdev->devlist_mtx);
 
@@ -635,7 +645,7 @@ int cfg80211_wext_siwencode(struct net_device *dev,
 	else if (!remove)
 		return -EINVAL;
 
-	return cfg80211_set_encryption(rdev, dev, NULL, remove,
+	return cfg80211_set_encryption(rdev, dev, false, NULL, remove,
 				       wdev->wext.default_key == -1,
 				       idx, &params);
 }
@@ -725,7 +735,9 @@ int cfg80211_wext_siwencodeext(struct net_device *dev,
 	}
 
 	return cfg80211_set_encryption(
-			rdev, dev, addr, remove,
+			rdev, dev,
+			!(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
+			addr, remove,
 			ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
 			idx, &params);
 }
@@ -829,7 +841,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_dev(wdev->wiphy);
-	enum tx_power_setting type;
+	enum nl80211_tx_power_setting type;
 	int dbm = 0;
 
 	if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
@@ -852,7 +864,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 			if (data->txpower.value < 0)
 				return -EINVAL;
 			dbm = data->txpower.value;
-			type = TX_POWER_FIXED;
+			type = NL80211_TX_POWER_FIXED;
 			/* TODO: do regulatory check! */
 		} else {
 			/*
@@ -860,10 +872,10 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 			 * passed in from userland.
 			 */
 			if (data->txpower.value < 0) {
-				type = TX_POWER_AUTOMATIC;
+				type = NL80211_TX_POWER_AUTOMATIC;
 			} else {
 				dbm = data->txpower.value;
-				type = TX_POWER_LIMITED;
+				type = NL80211_TX_POWER_LIMITED;
 			}
 		}
 	} else {
@@ -872,7 +884,7 @@ int cfg80211_wext_siwtxpower(struct net_device *dev,
 		return 0;
 	}
 
-	return rdev->ops->set_tx_power(wdev->wiphy, type, dbm);
+	return rdev->ops->set_tx_power(wdev->wiphy, type, DBM_TO_MBM(dbm));
 }
 EXPORT_SYMBOL_GPL(cfg80211_wext_siwtxpower);
 
@@ -1354,6 +1366,10 @@ struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev)
 	}
 
 	wstats.qual.updated |= IW_QUAL_NOISE_INVALID;
+	if (sinfo.filled & STATION_INFO_RX_DROP_MISC)
+		wstats.discard.misc = sinfo.rx_dropped_misc;
+	if (sinfo.filled & STATION_INFO_TX_FAILED)
+		wstats.discard.retries = sinfo.tx_failed;
 
 	return &wstats;
 }
@@ -1420,6 +1436,9 @@ int cfg80211_wext_giwessid(struct net_device *dev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 
+	data->flags = 0;
+	data->length = 0;
+
 	switch (wdev->iftype) {
 	case NL80211_IFTYPE_ADHOC:
 		return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
@@ -1471,6 +1490,7 @@ int cfg80211_wext_siwpmksa(struct net_device *dev,
 		return -EOPNOTSUPP;
 	}
 }
+EXPORT_SYMBOL_GPL(cfg80211_wext_siwpmksa);
 
 static const iw_handler cfg80211_handlers[] = {
 	[IW_IOCTL_IDX(SIOCGIWNAME)]	= (iw_handler) cfg80211_wext_giwname,
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 0ef17bc42ba..dc675a3daa3 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -611,7 +611,7 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev)
 #endif
 
 #ifdef CONFIG_CFG80211_WEXT
-	if (dev->ieee80211_ptr && dev->ieee80211_ptr &&
+	if (dev->ieee80211_ptr &&
 	    dev->ieee80211_ptr->wiphy &&
 	    dev->ieee80211_ptr->wiphy->wext &&
 	    dev->ieee80211_ptr->wiphy->wext->get_wireless_stats)
@@ -782,6 +782,22 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
 		}
 	}
 
+	if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) {
+		/*
+		 * If this is a GET, but not NOMAX, it means that the extra
+		 * data is not bounded by userspace, but by max_tokens. Thus
+		 * set the length to max_tokens. This matches the extra data
+		 * allocation.
+		 * The driver should fill it with the number of tokens it
+		 * provided, and it may check iwp->length rather than having
+		 * knowledge of max_tokens. If the driver doesn't change the
+		 * iwp->length, this ioctl just copies back max_token tokens
+		 * filled with zeroes. Hopefully the driver isn't claiming
+		 * them to be valid data.
+		 */
+		iwp->length = descr->max_tokens;
+	}
+
 	err = handler(dev, info, (union iwreq_data *) iwp, extra);
 
 	iwp->length += essid_compat;
diff --git a/net/wireless/wext-priv.c b/net/wireless/wext-priv.c
index 3feb28e41c5..674d426a9d2 100644
--- a/net/wireless/wext-priv.c
+++ b/net/wireless/wext-priv.c
@@ -152,7 +152,7 @@ static int ioctl_private_iw_point(struct iw_point *iwp, unsigned int cmd,
 	} else if (!iwp->pointer)
 		return -EFAULT;
 
-	extra = kmalloc(extra_size, GFP_KERNEL);
+	extra = kzalloc(extra_size, GFP_KERNEL);
 	if (!extra)
 		return -ENOMEM;
 
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 9818198add8..6fffe62d7c2 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -197,6 +197,8 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev,
 	wdev->wext.connect.ssid_len = len;
 
 	wdev->wext.connect.crypto.control_port = false;
+	wdev->wext.connect.crypto.control_port_ethertype =
+					cpu_to_be16(ETH_P_PAE);
 
 	err = cfg80211_mgd_wext_connect(rdev, wdev);
  out:
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e6759c9660b..2196e55e4f6 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -5,6 +5,7 @@
 config X25
 	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
+	depends on BKL # should be fixable
 	---help---
 	  X.25 is a set of standardized network protocols, similar in scope to
 	  frame relay; the one physical line from your box to the X.25 network
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5e86d4e97dc..f7af98dff40 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -507,14 +507,14 @@ static int x25_listen(struct socket *sock, int backlog)
 	struct sock *sk = sock->sk;
 	int rc = -EOPNOTSUPP;
 
-	lock_kernel();
+	lock_sock(sk);
 	if (sk->sk_state != TCP_LISTEN) {
 		memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN);
 		sk->sk_max_ack_backlog = backlog;
 		sk->sk_state           = TCP_LISTEN;
 		rc = 0;
 	}
-	unlock_kernel();
+	release_sock(sk);
 
 	return rc;
 }
@@ -688,7 +688,6 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr;
 	int len, i, rc = 0;
 
-	lock_kernel();
 	if (!sock_flag(sk, SOCK_ZAPPED) ||
 	    addr_len != sizeof(struct sockaddr_x25) ||
 	    addr->sx25_family != AF_X25) {
@@ -704,12 +703,13 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		}
 	}
 
+	lock_sock(sk);
 	x25_sk(sk)->source_addr = addr->sx25_addr;
 	x25_insert_socket(sk);
 	sock_reset_flag(sk, SOCK_ZAPPED);
+	release_sock(sk);
 	SOCK_DEBUG(sk, "x25_bind: socket is bound\n");
 out:
-	unlock_kernel();
 	return rc;
 }
 
@@ -751,7 +751,6 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct x25_route *rt;
 	int rc = 0;
 
-	lock_kernel();
 	lock_sock(sk);
 	if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) {
 		sock->state = SS_CONNECTED;
@@ -829,7 +828,6 @@ out_put_route:
 	x25_route_put(rt);
 out:
 	release_sock(sk);
-	unlock_kernel();
 	return rc;
 }
 
@@ -869,8 +867,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
 	struct sk_buff *skb;
 	int rc = -EINVAL;
 
-	lock_kernel();
-	if (!sk || sk->sk_state != TCP_LISTEN)
+	if (!sk)
 		goto out;
 
 	rc = -EOPNOTSUPP;
@@ -878,6 +875,10 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
 		goto out;
 
 	lock_sock(sk);
+	rc = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out2;
+
 	rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
 	if (rc)
 		goto out2;
@@ -897,7 +898,6 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags)
 out2:
 	release_sock(sk);
 out:
-	unlock_kernel();
 	return rc;
 }
 
@@ -909,7 +909,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
 	struct x25_sock *x25 = x25_sk(sk);
 	int rc = 0;
 
-	lock_kernel();
 	if (peer) {
 		if (sk->sk_state != TCP_ESTABLISHED) {
 			rc = -ENOTCONN;
@@ -923,19 +922,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
 	*uaddr_len = sizeof(*sx25);
 
 out:
-	unlock_kernel();
-	return rc;
-}
-
-static unsigned int x25_datagram_poll(struct file *file, struct socket *sock,
-			   poll_table *wait)
-{
-	int rc;
-
-	lock_kernel();
-	rc = datagram_poll(file, sock, wait);
-	unlock_kernel();
-
 	return rc;
 }
 
@@ -1746,7 +1732,7 @@ static const struct proto_ops x25_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	x25_accept,
 	.getname =	x25_getname,
-	.poll =		x25_datagram_poll,
+	.poll =		datagram_poll,
 	.ioctl =	x25_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = compat_x25_ioctl,
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a3cca0a9434..64f2ae1fdc1 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -101,7 +101,7 @@ resume:
 			err = -EHOSTUNREACH;
 			goto error_nolock;
 		}
-		skb_dst_set_noref(skb, dst);
+		skb_dst_set(skb, dst_clone(dst));
 		x = dst->xfrm;
 	} while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index a7ec5a8a238..044e7789851 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -50,6 +50,9 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
 static int stale_bundle(struct dst_entry *dst);
+static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst,
+			  struct flowi *fl, int family, int strict);
+
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
@@ -1175,9 +1178,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
 		    tmpl->mode == XFRM_MODE_BEET) {
 			remote = &tmpl->id.daddr;
 			local = &tmpl->saddr;
-			family = tmpl->encap_family;
-			if (xfrm_addr_any(local, family)) {
-				error = xfrm_get_saddr(net, &tmp, remote, family);
+			if (xfrm_addr_any(local, tmpl->encap_family)) {
+				error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family);
 				if (error)
 					goto fail;
 				local = &tmp;
@@ -2277,7 +2279,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst)
  * still valid.
  */
 
-int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
+static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 		struct flowi *fl, int family, int strict)
 {
 	struct dst_entry *dst = &first->u.dst;
@@ -2359,8 +2361,6 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
 	return 1;
 }
 
-EXPORT_SYMBOL(xfrm_bundle_ok);
-
 int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
 	struct net *net;
@@ -2492,7 +2492,8 @@ static int __net_init xfrm_statistics_init(struct net *net)
 	int rv;
 
 	if (snmp_mib_init((void __percpu **)net->mib.xfrm_statistics,
-			  sizeof(struct linux_xfrm_mib)) < 0)
+			  sizeof(struct linux_xfrm_mib),
+			  __alignof__(struct linux_xfrm_mib)) < 0)
 		return -ENOMEM;
 	rv = xfrm_proc_init(net);
 	if (rv < 0)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5208b12fbfb..eb96ce52f17 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -656,15 +656,23 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 EXPORT_SYMBOL(xfrm_sad_getinfo);
 
 static int
-xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
-		  struct xfrm_tmpl *tmpl,
-		  xfrm_address_t *daddr, xfrm_address_t *saddr,
-		  unsigned short family)
+xfrm_init_tempstate(struct xfrm_state *x, struct flowi *fl,
+		    struct xfrm_tmpl *tmpl,
+		    xfrm_address_t *daddr, xfrm_address_t *saddr,
+		    unsigned short family)
 {
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
 	if (!afinfo)
 		return -1;
-	afinfo->init_tempsel(x, fl, tmpl, daddr, saddr);
+	afinfo->init_tempsel(&x->sel, fl);
+
+	if (family != tmpl->encap_family) {
+		xfrm_state_put_afinfo(afinfo);
+		afinfo = xfrm_state_get_afinfo(tmpl->encap_family);
+		if (!afinfo)
+			return -1;
+	}
+	afinfo->init_temprop(x, tmpl, daddr, saddr);
 	xfrm_state_put_afinfo(afinfo);
 	return 0;
 }
@@ -790,37 +798,38 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 	int error = 0;
 	struct xfrm_state *best = NULL;
 	u32 mark = pol->mark.v & pol->mark.m;
+	unsigned short encap_family = tmpl->encap_family;
 
 	to_put = NULL;
 
 	spin_lock_bh(&xfrm_state_lock);
-	h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, family);
+	h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
 	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
-		if (x->props.family == family &&
+		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
-		    xfrm_state_addr_check(x, daddr, saddr, family) &&
+		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
 		    tmpl->id.proto == x->id.proto &&
 		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
-			xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+			xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
 					   &best, &acquire_in_progress, &error);
 	}
 	if (best)
 		goto found;
 
-	h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
+	h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
 	hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
-		if (x->props.family == family &&
+		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
-		    xfrm_state_addr_check(x, daddr, saddr, family) &&
+		    xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
 		    tmpl->mode == x->props.mode &&
 		    tmpl->id.proto == x->id.proto &&
 		    (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
-			xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
+			xfrm_state_look_at(pol, x, fl, encap_family, daddr, saddr,
 					   &best, &acquire_in_progress, &error);
 	}
 
@@ -829,7 +838,7 @@ found:
 	if (!x && !error && !acquire_in_progress) {
 		if (tmpl->id.spi &&
 		    (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi,
-					      tmpl->id.proto, family)) != NULL) {
+					      tmpl->id.proto, encap_family)) != NULL) {
 			to_put = x0;
 			error = -EEXIST;
 			goto out;
@@ -839,9 +848,9 @@ found:
 			error = -ENOMEM;
 			goto out;
 		}
-		/* Initialize temporary selector matching only
+		/* Initialize temporary state matching only
 		 * to current session. */
-		xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
+		xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
 		memcpy(&x->mark, &pol->mark, sizeof(x->mark));
 
 		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
@@ -856,10 +865,10 @@ found:
 			x->km.state = XFRM_STATE_ACQ;
 			list_add(&x->km.all, &net->xfrm.state_all);
 			hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
-			h = xfrm_src_hash(net, daddr, saddr, family);
+			h = xfrm_src_hash(net, daddr, saddr, encap_family);
 			hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
 			if (x->id.spi) {
-				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, family);
+				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
 				hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
 			}
 			x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ba59983aaff..8bae6b22c84 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1801,7 +1801,7 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct xfrm_user_expire *ue = nlmsg_data(nlh);
 	struct xfrm_usersa_info *p = &ue->state;
 	struct xfrm_mark m;
-	u32 mark = xfrm_mark_get(attrs, &m);;
+	u32 mark = xfrm_mark_get(attrs, &m);
 
 	x = xfrm_state_lookup(net, mark, &p->id.daddr, p->id.spi, p->id.proto, p->family);
 
@@ -2504,7 +2504,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
 	if (p->dir > XFRM_POLICY_OUT)
 		return NULL;
 
-	xp = xfrm_policy_alloc(net, GFP_KERNEL);
+	xp = xfrm_policy_alloc(net, GFP_ATOMIC);
 	if (xp == NULL) {
 		*dir = -ENOBUFS;
 		return NULL;