aboutsummaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/datagram.c116
-rw-r--r--net/core/dev.c400
-rw-r--r--net/core/drop_monitor.c137
-rw-r--r--net/core/fib_rules.c4
-rw-r--r--net/core/iovec.c33
-rw-r--r--net/core/net-sysfs.c8
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/net_namespace.c54
-rw-r--r--net/core/netpoll.c2
-rw-r--r--net/core/skbuff.c17
-rw-r--r--net/core/stream.c3
11 files changed, 655 insertions, 123 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b01a76abe1d..e2a36f05cdf 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -260,7 +260,9 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
spin_unlock_bh(&sk->sk_receive_queue.lock);
}
- skb_free_datagram(sk, skb);
+ kfree_skb(skb);
+ sk_mem_reclaim_partial(sk);
+
return err;
}
@@ -351,17 +353,111 @@ fault:
}
/**
+ * skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: io vector to copy to
+ * @to_offset: offset in the io vector to start copying to
+ * @len: amount of data to copy from buffer to iovec
+ *
+ * Returns 0 or -EFAULT.
+ * Note: the iovec is not modified during the copy.
+ */
+int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
+ const struct iovec *to, int to_offset,
+ int len)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+
+ /* Copy header. */
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+
+ /* Copy paged appendix. Hmm... why does this look so complicated? */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end - offset) > 0) {
+ int err;
+ u8 *vaddr;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = frag->page;
+
+ if (copy > len)
+ copy = len;
+ vaddr = kmap(page);
+ err = memcpy_toiovecend(to, vaddr + frag->page_offset +
+ offset - start, to_offset, copy);
+ kunmap(page);
+ if (err)
+ goto fault;
+ if (!(len -= copy))
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+ start = end;
+ }
+
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+ for (; list; list = list->next) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + list->len;
+ if ((copy = end - offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_copy_datagram_const_iovec(list,
+ offset - start,
+ to, to_offset,
+ copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ to_offset += copy;
+ }
+ start = end;
+ }
+ }
+ if (!len)
+ return 0;
+
+fault:
+ return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
+
+/**
* skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying to
* @from: io vector to copy to
+ * @from_offset: offset in the io vector to start copying from
* @len: amount of data to copy to buffer from iovec
*
* Returns 0 or -EFAULT.
- * Note: the iovec is modified during the copy.
+ * Note: the iovec is not modified during the copy.
*/
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
- struct iovec *from, int len)
+ const struct iovec *from, int from_offset,
+ int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -370,11 +466,12 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- if (memcpy_fromiovec(skb->data + offset, from, copy))
+ if (memcpy_fromiovecend(skb->data + offset, from, 0, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
+ from_offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
@@ -393,8 +490,9 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
vaddr = kmap(page);
- err = memcpy_fromiovec(vaddr + frag->page_offset +
- offset - start, from, copy);
+ err = memcpy_fromiovecend(vaddr + frag->page_offset +
+ offset - start,
+ from, from_offset, copy);
kunmap(page);
if (err)
goto fault;
@@ -402,6 +500,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
if (!(len -= copy))
return 0;
offset += copy;
+ from_offset += copy;
}
start = end;
}
@@ -420,11 +519,14 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
copy = len;
if (skb_copy_datagram_from_iovec(list,
offset - start,
- from, copy))
+ from,
+ from_offset,
+ copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
+ from_offset += copy;
}
start = end;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index e2e9e4af3ac..3942266d1f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -126,6 +126,7 @@
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <trace/napi.h>
#include "net-sysfs.h"
@@ -1688,6 +1689,14 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
goto gso;
}
+ /*
+ * If device doesnt need skb->dst, release it right now while
+ * its hot in this cpu cache
+ */
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) {
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ }
rc = ops->ndo_start_xmit(skb, dev);
/*
* TODO: if skb_orphan() was called by
@@ -1735,8 +1744,12 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
u32 hash;
- if (skb_rx_queue_recorded(skb))
- return skb_get_rx_queue(skb) % dev->real_num_tx_queues;
+ if (skb_rx_queue_recorded(skb)) {
+ hash = skb_get_rx_queue(skb);
+ while (unlikely (hash >= dev->real_num_tx_queues))
+ hash -= dev->real_num_tx_queues;
+ return hash;
+ }
if (skb->sk && skb->sk->sk_hash)
hash = skb->sk->sk_hash;
@@ -2379,18 +2392,13 @@ void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
unsigned int offset = skb_gro_offset(skb);
hlen += offset;
- if (hlen <= skb_headlen(skb))
- return skb->data + offset;
-
- if (unlikely(!skb_shinfo(skb)->nr_frags ||
- skb_shinfo(skb)->frags[0].size <=
- hlen - skb_headlen(skb) ||
+ if (unlikely(skb_headlen(skb) ||
+ skb_shinfo(skb)->frags[0].size < hlen ||
PageHighMem(skb_shinfo(skb)->frags[0].page)))
return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
return page_address(skb_shinfo(skb)->frags[0].page) +
- skb_shinfo(skb)->frags[0].page_offset +
- offset - skb_headlen(skb);
+ skb_shinfo(skb)->frags[0].page_offset + offset;
}
EXPORT_SYMBOL(skb_gro_header);
@@ -2526,16 +2534,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
}
EXPORT_SYMBOL(napi_reuse_skb);
-struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
- struct napi_gro_fraginfo *info)
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
struct net_device *dev = napi->dev;
struct sk_buff *skb = napi->skb;
- struct ethhdr *eth;
- skb_frag_t *frag;
- int i;
-
- napi->skb = NULL;
if (!skb) {
skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
@@ -2543,47 +2545,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
goto out;
skb_reserve(skb, NET_IP_ALIGN);
- }
-
- BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
- frag = info->frags;
-
- for (i = 0; i < info->nr_frags; i++) {
- skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
- frag->size);
- frag++;
- }
- skb_shinfo(skb)->nr_frags = info->nr_frags;
-
- skb->data_len = info->len;
- skb->len += info->len;
- skb->truesize += info->len;
-
- skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
- eth = skb_gro_header(skb, sizeof(*eth));
- if (!eth) {
- napi_reuse_skb(napi, skb);
- skb = NULL;
- goto out;
+ napi->skb = skb;
}
- skb_gro_pull(skb, sizeof(*eth));
-
- /*
- * This works because the only protocols we care about don't require
- * special handling. We'll fix it up properly at the end.
- */
- skb->protocol = eth->h_proto;
-
- skb->ip_summed = info->ip_summed;
- skb->csum = info->csum;
-
out:
return skb;
}
-EXPORT_SYMBOL(napi_fraginfo_skb);
+EXPORT_SYMBOL(napi_get_frags);
int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
{
@@ -2613,9 +2582,39 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
}
EXPORT_SYMBOL(napi_frags_finish);
-int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ struct ethhdr *eth;
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb);
+
+ eth = skb_gro_header(skb, sizeof(*eth));
+ if (!eth) {
+ napi_reuse_skb(napi, skb);
+ skb = NULL;
+ goto out;
+ }
+
+ skb_gro_pull(skb, sizeof(*eth));
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling. We'll fix it up properly at the end.
+ */
+ skb->protocol = eth->h_proto;
+
+out:
+ return skb;
+}
+EXPORT_SYMBOL(napi_frags_skb);
+
+int napi_gro_frags(struct napi_struct *napi)
{
- struct sk_buff *skb = napi_fraginfo_skb(napi, info);
+ struct sk_buff *skb = napi_frags_skb(napi);
if (!skb)
return NET_RX_DROP;
@@ -2719,7 +2718,7 @@ void netif_napi_del(struct napi_struct *napi)
struct sk_buff *skb, *next;
list_del_init(&napi->dev_list);
- kfree_skb(napi->skb);
+ napi_free_frags(napi);
for (skb = napi->gro_list; skb; skb = next) {
next = skb->next;
@@ -2773,8 +2772,10 @@ static void net_rx_action(struct softirq_action *h)
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state))
+ if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight);
+ trace_napi_poll(n);
+ }
WARN_ON_ONCE(work > weight);
@@ -3444,6 +3445,252 @@ void dev_set_rx_mode(struct net_device *dev)
netif_addr_unlock_bh(dev);
}
+/* hw addresses list handling functions */
+
+static int __hw_addr_add(struct list_head *list, unsigned char *addr,
+ int addr_len, unsigned char addr_type)
+{
+ struct netdev_hw_addr *ha;
+ int alloc_size;
+
+ if (addr_len > MAX_ADDR_LEN)
+ return -EINVAL;
+
+ alloc_size = sizeof(*ha);
+ if (alloc_size < L1_CACHE_BYTES)
+ alloc_size = L1_CACHE_BYTES;
+ ha = kmalloc(alloc_size, GFP_ATOMIC);
+ if (!ha)
+ return -ENOMEM;
+ memcpy(ha->addr, addr, addr_len);
+ ha->type = addr_type;
+ list_add_tail_rcu(&ha->list, list);
+ return 0;
+}
+
+static void ha_rcu_free(struct rcu_head *head)
+{
+ struct netdev_hw_addr *ha;
+
+ ha = container_of(head, struct netdev_hw_addr, rcu_head);
+ kfree(ha);
+}
+
+static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr,
+ int addr_len, unsigned char addr_type,
+ int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ int i = 0;
+
+ list_for_each_entry(ha, list, list) {
+ if (i++ != ignore_index &&
+ !memcmp(ha->addr, addr, addr_len) &&
+ (ha->type == addr_type || !addr_type)) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static int __hw_addr_add_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, unsigned char addr_type,
+ int ignore_index)
+{
+ int err;
+ struct netdev_hw_addr *ha, *ha2;
+ unsigned char type;
+
+ list_for_each_entry(ha, from_list, list) {
+ type = addr_type ? addr_type : ha->type;
+ err = __hw_addr_add(to_list, ha->addr, addr_len, type);
+ if (err)
+ goto unroll;
+ }
+ return 0;
+
+unroll:
+ list_for_each_entry(ha2, from_list, list) {
+ if (ha2 == ha)
+ break;
+ type = addr_type ? addr_type : ha2->type;
+ __hw_addr_del_ii(to_list, ha2->addr, addr_len, type,
+ ignore_index);
+ }
+ return err;
+}
+
+static void __hw_addr_del_multiple_ii(struct list_head *to_list,
+ struct list_head *from_list,
+ int addr_len, unsigned char addr_type,
+ int ignore_index)
+{
+ struct netdev_hw_addr *ha;
+ unsigned char type;
+
+ list_for_each_entry(ha, from_list, list) {
+ type = addr_type ? addr_type : ha->type;
+ __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type,
+ ignore_index);
+ }
+}
+
+static void __hw_addr_flush(struct list_head *list)
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, list, list) {
+ list_del_rcu(&ha->list);
+ call_rcu(&ha->rcu_head, ha_rcu_free);
+ }
+}
+
+/* Device addresses handling functions */
+
+static void dev_addr_flush(struct net_device *dev)
+{
+ /* rtnl_mutex must be held here */
+
+ __hw_addr_flush(&dev->dev_addr_list);
+ dev->dev_addr = NULL;
+}
+
+static int dev_addr_init(struct net_device *dev)
+{
+ unsigned char addr[MAX_ADDR_LEN];
+ struct netdev_hw_addr *ha;
+ int err;
+
+ /* rtnl_mutex must be held here */
+
+ INIT_LIST_HEAD(&dev->dev_addr_list);
+ memset(addr, 0, sizeof(*addr));
+ err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr),
+ NETDEV_HW_ADDR_T_LAN);
+ if (!err) {
+ /*
+ * Get the first (previously created) address from the list
+ * and set dev_addr pointer to this location.
+ */
+ ha = list_first_entry(&dev->dev_addr_list,
+ struct netdev_hw_addr, list);
+ dev->dev_addr = ha->addr;
+ }
+ return err;
+}
+
+/**
+ * dev_addr_add - Add a device address
+ * @dev: device
+ * @addr: address to add
+ * @addr_type: address type
+ *
+ * Add a device address to the device or increase the reference count if
+ * it already exists.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add(struct net_device *dev, unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len,
+ addr_type);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add);
+
+/**
+ * dev_addr_del - Release a device address.
+ * @dev: device
+ * @addr: address to delete
+ * @addr_type: address type
+ *
+ * Release reference to a device address and remove it from the device
+ * if the reference count drops to zero.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del(struct net_device *dev, unsigned char *addr,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len,
+ addr_type, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_del);
+
+/**
+ * dev_addr_add_multiple - Add device addresses from another device
+ * @to_dev: device to which addresses will be added
+ * @from_dev: device from which addresses will be added
+ * @addr_type: address type - 0 means type will be used from from_dev
+ *
+ * Add device addresses of the one device to another.
+ **
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_add_multiple(struct net_device *to_dev,
+ struct net_device *from_dev,
+ unsigned char addr_type)
+{
+ int err;
+
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, addr_type, 0);
+ if (!err)
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return err;
+}
+EXPORT_SYMBOL(dev_addr_add_multiple);
+
+/**
+ * dev_addr_del_multiple - Delete device addresses by another device
+ * @to_dev: device where the addresses will be deleted
+ * @from_dev: device by which addresses the addresses will be deleted
+ * @addr_type: address type - 0 means type will used from from_dev
+ *
+ * Deletes addresses in to device by the list of addresses in from device.
+ *
+ * The caller must hold the rtnl_mutex.
+ */
+int dev_addr_del_multiple(struct net_device *to_dev,
+ struct net_device *from_dev,
+ unsigned char addr_type)
+{
+ ASSERT_RTNL();
+
+ if (from_dev->addr_len != to_dev->addr_len)
+ return -EINVAL;
+ __hw_addr_del_multiple_ii(&to_dev->dev_addr_list,
+ &from_dev->dev_addr_list,
+ to_dev->addr_len, addr_type, 0);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
+ return 0;
+}
+EXPORT_SYMBOL(dev_addr_del_multiple);
+
+/* unicast and multicast addresses handling functions */
+
int __dev_addr_delete(struct dev_addr_list **list, int *count,
void *addr, int alen, int glbl)
{
@@ -4707,13 +4954,30 @@ void netdev_run_todo(void)
* the internal statistics structure is used.
*/
const struct net_device_stats *dev_get_stats(struct net_device *dev)
- {
+{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_get_stats)
return ops->ndo_get_stats(dev);
- else
- return &dev->stats;
+ else {
+ unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
+ struct net_device_stats *stats = &dev->stats;
+ unsigned int i;
+ struct netdev_queue *txq;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ txq = netdev_get_tx_queue(dev, i);
+ tx_bytes += txq->tx_bytes;
+ tx_packets += txq->tx_packets;
+ tx_dropped += txq->tx_dropped;
+ }
+ if (tx_bytes || tx_packets || tx_dropped) {
+ stats->tx_bytes = tx_bytes;
+ stats->tx_packets = tx_packets;
+ stats->tx_dropped = tx_dropped;
+ }
+ return stats;
+ }
}
EXPORT_SYMBOL(dev_get_stats);
@@ -4771,13 +5035,16 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
if (!tx) {
printk(KERN_ERR "alloc_netdev: Unable to allocate "
"tx qdiscs.\n");
- kfree(p);
- return NULL;
+ goto free_p;
}
dev = (struct net_device *)
(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
dev->padded = (char *)dev - (char *)p;
+
+ if (dev_addr_init(dev))
+ goto free_tx;
+
dev_net_set(dev, &init_net);
dev->_tx = tx;
@@ -4789,9 +5056,17 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
netdev_init_queues(dev);
INIT_LIST_HEAD(&dev->napi_list);
+ dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
strcpy(dev->name, name);
return dev;
+
+free_tx:
+ kfree(tx);
+
+free_p:
+ kfree(p);
+ return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mq);
@@ -4811,6 +5086,9 @@ void free_netdev(struct net_device *dev)
kfree(dev->_tx);
+ /* Flush device addresses */
+ dev_addr_flush(dev);
+
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 9fd0dc3cca9..a6c2ac2828f 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -22,8 +22,10 @@
#include <linux/timer.h>
#include <linux/bitops.h>
#include <net/genetlink.h>
+#include <net/netevent.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
@@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused);
* and the work handle that will send up
* netlink alerts
*/
-struct sock *dm_sock;
+static int trace_state = TRACE_OFF;
+static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED;
struct per_cpu_dm_data {
struct work_struct dm_alert_work;
@@ -47,11 +50,18 @@ struct per_cpu_dm_data {
struct timer_list send_timer;
};
+struct dm_hw_stat_delta {
+ struct net_device *dev;
+ struct list_head list;
+ struct rcu_head rcu;
+ unsigned long last_drop_val;
+};
+
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
.name = "NET_DM",
- .version = 1,
+ .version = 2,
.maxattr = NET_DM_CMD_MAX,
};
@@ -59,19 +69,24 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
-
+static unsigned long dm_hw_check_delta = 2*HZ;
+static LIST_HEAD(hw_stats_list);
static void reset_per_cpu_data(struct per_cpu_dm_data *data)
{
size_t al;
struct net_dm_alert_msg *msg;
+ struct nlattr *nla;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
+ al += sizeof(struct nlattr);
+
data->skb = genlmsg_new(al, GFP_KERNEL);
genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
0, NET_DM_CMD_ALERT);
- msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg));
+ nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg));
+ msg = nla_data(nla);
memset(msg, 0, al);
atomic_set(&data->dm_hit_count, dm_hit_limit);
}
@@ -111,10 +126,11 @@ static void sched_send_work(unsigned long unused)
schedule_work(&data->dm_alert_work);
}
-static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
+ struct nlattr *nla;
int i;
struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
@@ -127,7 +143,8 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
}
nlh = (struct nlmsghdr *)data->skb->data;
- msg = genlmsg_data(nlmsg_data(nlh));
+ nla = genlmsg_data(nlmsg_data(nlh));
+ msg = nla_data(nla);
for (i = 0; i < msg->entries; i++) {
if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
msg->points[i].count++;
@@ -139,6 +156,7 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
* We need to create a new entry
*/
__nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
+ nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
msg->points[msg->entries].count = 1;
msg->entries++;
@@ -152,24 +170,80 @@ out:
return;
}
+static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
+{
+ trace_drop_common(skb, location);
+}
+
+static void trace_napi_poll_hit(struct napi_struct *napi)
+{
+ struct dm_hw_stat_delta *new_stat;
+
+ /*
+ * Ratelimit our check time to dm_hw_check_delta jiffies
+ */
+ if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ if ((new_stat->dev == napi->dev) &&
+ (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ trace_drop_common(NULL, NULL);
+ new_stat->last_drop_val = napi->dev->stats.rx_dropped;
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
+
+static void free_dm_hw_stat(struct rcu_head *head)
+{
+ struct dm_hw_stat_delta *n;
+ n = container_of(head, struct dm_hw_stat_delta, rcu);
+ kfree(n);
+}
+
static int set_all_monitor_traces(int state)
{
int rc = 0;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *temp;
+
+ spin_lock(&trace_state_lock);
switch (state) {
case TRACE_ON:
rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= register_trace_napi_poll(trace_napi_poll_hit);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
+ rc |= unregister_trace_napi_poll(trace_napi_poll_hit);
tracepoint_synchronize_unregister();
+
+ /*
+ * Clean the device list
+ */
+ list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
+ if (new_stat->dev == NULL) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ }
+ }
break;
default:
rc = 1;
break;
}
+ if (!rc)
+ trace_state = state;
+
+ spin_unlock(&trace_state_lock);
+
if (rc)
return -EINPROGRESS;
return rc;
@@ -197,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
return -ENOTSUPP;
}
+static int dropmon_net_event(struct notifier_block *ev_block,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct dm_hw_stat_delta *new_stat = NULL;
+ struct dm_hw_stat_delta *tmp;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+
+ if (!new_stat)
+ goto out;
+
+ new_stat->dev = dev;
+ INIT_RCU_HEAD(&new_stat->rcu);
+ spin_lock(&trace_state_lock);
+ list_add_rcu(&new_stat->list, &hw_stats_list);
+ spin_unlock(&trace_state_lock);
+ break;
+ case NETDEV_UNREGISTER:
+ spin_lock(&trace_state_lock);
+ list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
+ if (new_stat->dev == dev) {
+ new_stat->dev = NULL;
+ if (trace_state == TRACE_OFF) {
+ list_del_rcu(&new_stat->list);
+ call_rcu(&new_stat->rcu, free_dm_hw_stat);
+ break;
+ }
+ }
+ }
+ spin_unlock(&trace_state_lock);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
static struct genl_ops dropmon_ops[] = {
{
@@ -213,6 +325,10 @@ static struct genl_ops dropmon_ops[] = {
},
};
+static struct notifier_block dropmon_net_notifier = {
+ .notifier_call = dropmon_net_event
+};
+
static int __init init_net_drop_monitor(void)
{
int cpu;
@@ -236,12 +352,18 @@ static int __init init_net_drop_monitor(void)
ret = genl_register_ops(&net_drop_monitor_family,
&dropmon_ops[i]);
if (ret) {
- printk(KERN_CRIT "failed to register operation %d\n",
+ printk(KERN_CRIT "Failed to register operation %d\n",
dropmon_ops[i].cmd);
goto out_unreg;
}
}
+ rc = register_netdevice_notifier(&dropmon_net_notifier);
+ if (rc < 0) {
+ printk(KERN_CRIT "Failed to register netdevice notifier\n");
+ goto out_unreg;
+ }
+
rc = 0;
for_each_present_cpu(cpu) {
@@ -252,6 +374,7 @@ static int __init init_net_drop_monitor(void)
data->send_timer.data = cpu;
data->send_timer.function = sched_send_work;
}
+
goto out;
out_unreg:
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98691e1466b..bd309384f8b 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -299,7 +299,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
} else if (rule->action == FR_ACT_GOTO)
goto errout_free;
- err = ops->configure(rule, skb, nlh, frh, tb);
+ err = ops->configure(rule, skb, frh, tb);
if (err < 0)
goto errout_free;
@@ -500,7 +500,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
if (rule->target)
NLA_PUT_U32(skb, FRA_GOTO, rule->target);
- if (ops->fill(rule, skb, nlh, frh) < 0)
+ if (ops->fill(rule, skb, frh) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 4c9c0121c9d..40a76ce19d9 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -98,6 +98,31 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
}
/*
+ * Copy kernel to iovec. Returns -EFAULT on error.
+ */
+
+int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
+ int offset, int len)
+{
+ int copy;
+ for (; len > 0; ++iov) {
+ /* Skip over the finished iovecs */
+ if (unlikely(offset >= iov->iov_len)) {
+ offset -= iov->iov_len;
+ continue;
+ }
+ copy = min_t(unsigned int, iov->iov_len - offset, len);
+ offset = 0;
+ if (copy_to_user(iov->iov_base, kdata, copy))
+ return -EFAULT;
+ kdata += copy;
+ len -= copy;
+ }
+
+ return 0;
+}
+
+/*
* Copy iovec to kernel. Returns -EFAULT on error.
*
* Note: this modifies the original iovec.
@@ -122,10 +147,11 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
}
/*
- * For use with ip_build_xmit
+ * Copy iovec from kernel. Returns -EFAULT on error.
*/
-int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset,
- int len)
+
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+ int offset, int len)
{
/* Skip over the finished iovecs */
while (offset >= iov->iov_len) {
@@ -236,3 +262,4 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
EXPORT_SYMBOL(memcpy_fromiovec);
EXPORT_SYMBOL(memcpy_fromiovecend);
EXPORT_SYMBOL(memcpy_toiovec);
+EXPORT_SYMBOL(memcpy_toiovecend);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2da59a0ac4a..b9641e816ee 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -78,7 +78,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
goto err;
if (!rtnl_trylock())
- return -ERESTARTSYS;
+ return restart_syscall();
if (dev_isalive(net)) {
if ((ret = (*set)(net, new)) == 0)
@@ -225,7 +225,8 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- rtnl_lock();
+ if (!rtnl_trylock())
+ return restart_syscall();
ret = dev_set_alias(netdev, buf, count);
rtnl_unlock();
@@ -238,7 +239,8 @@ static ssize_t show_ifalias(struct device *dev,
const struct net_device *netdev = to_net_dev(dev);
ssize_t ret = 0;
- rtnl_lock();
+ if (!rtnl_trylock())
+ return restart_syscall();
if (netdev->ifalias)
ret = sprintf(buf, "%s\n", netdev->ifalias);
rtnl_unlock();
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb45665e4..b07b25bd2cd 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <trace/skb.h>
+#include <trace/napi.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
@@ -27,3 +28,6 @@
DEFINE_TRACE(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
+
+DEFINE_TRACE(napi_poll);
+EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e3bebd36f05..b7292a2719d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -115,41 +115,34 @@ static void net_free(struct net *net)
kmem_cache_free(net_cachep, net);
}
-struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+static struct net *net_create(void)
{
- struct net *new_net = NULL;
- int err;
-
- get_net(old_net);
-
- if (!(flags & CLONE_NEWNET))
- return old_net;
-
- err = -ENOMEM;
- new_net = net_alloc();
- if (!new_net)
- goto out_err;
+ struct net *net;
+ int rv;
+ net = net_alloc();
+ if (!net)
+ return ERR_PTR(-ENOMEM);
mutex_lock(&net_mutex);
- err = setup_net(new_net);
- if (!err) {
+ rv = setup_net(net);
+ if (rv == 0) {
rtnl_lock();
- list_add_tail(&new_net->list, &net_namespace_list);
+ list_add_tail(&net->list, &net_namespace_list);
rtnl_unlock();
}
mutex_unlock(&net_mutex);
+ if (rv < 0) {
+ net_free(net);
+ return ERR_PTR(rv);
+ }
+ return net;
+}
- if (err)
- goto out_free;
-out:
- put_net(old_net);
- return new_net;
-
-out_free:
- net_free(new_net);
-out_err:
- new_net = ERR_PTR(err);
- goto out;
+struct net *copy_net_ns(unsigned long flags, struct net *old_net)
+{
+ if (!(flags & CLONE_NEWNET))
+ return get_net(old_net);
+ return net_create();
}
static void cleanup_net(struct work_struct *work)
@@ -203,9 +196,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
static int __init net_ns_init(void)
{
struct net_generic *ng;
- int err;
- printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
@@ -224,15 +215,14 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
mutex_lock(&net_mutex);
- err = setup_net(&init_net);
+ if (setup_net(&init_net))
+ panic("Could not setup the initial network namespace");
rtnl_lock();
list_add_tail(&init_net.list, &net_namespace_list);
rtnl_unlock();
mutex_unlock(&net_mutex);
- if (err)
- panic("Could not setup the initial network namespace");
return 0;
}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 64f51eec657..67b4f3e3d4a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -24,6 +24,7 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
+#include <trace/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo,
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
+ trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
atomic_dec(&trapped);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e505b5392e1..d429c41e0dc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -526,8 +526,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->sp = secpath_get(old->sp);
#endif
memcpy(new->cb, old->cb, sizeof(old->cb));
- new->csum_start = old->csum_start;
- new->csum_offset = old->csum_offset;
+ new->csum = old->csum;
new->local_df = old->local_df;
new->pkt_type = old->pkt_type;
new->ip_summed = old->ip_summed;
@@ -538,6 +537,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->protocol = old->protocol;
new->mark = old->mark;
+ new->iif = old->iif;
__nf_copy(new, old);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
@@ -550,10 +550,18 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
#endif
new->vlan_tci = old->vlan_tci;
+#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
+ new->do_not_encrypt = old->do_not_encrypt;
+ new->requeue = old->requeue;
+#endif
skb_copy_secmark(new, old);
}
+/*
+ * You should not add any new code to this function. Add it to
+ * __copy_skb_header above instead.
+ */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x
@@ -569,16 +577,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->cloned = 1;
n->nohdr = 0;
n->destructor = NULL;
- C(iif);
C(tail);
C(end);
C(head);
C(data);
C(truesize);
-#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
- C(do_not_encrypt);
- C(requeue);
-#endif
atomic_set(&n->users, 1);
atomic_inc(&(skb_shinfo(skb)->dataref));
diff --git a/net/core/stream.c b/net/core/stream.c
index 8727cead64a..a37debfeb1b 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -33,7 +33,8 @@ void sk_stream_write_space(struct sock *sk)
clear_bit(SOCK_NOSPACE, &sock->flags);
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible_poll(sk->sk_sleep, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
}