aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/core/dev.c112
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/core/sock.c3
-rw-r--r--net/ipv4/icmp.c30
-rw-r--r--net/ipv4/ip_output.c8
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/mac80211/rx.c2
-rw-r--r--net/netfilter/core.c6
-rw-r--r--net/packet/af_packet.c5
-rw-r--r--net/rds/ib_rdma.c3
-rw-r--r--net/sched/sch_generic.c2
11 files changed, 144 insertions, 40 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 37bddf729e77..3d2b58d65add 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -181,6 +181,7 @@ static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
+static DEFINE_MUTEX(devnet_rename_mutex);
static inline void dev_base_seq_inc(struct net *net)
{
@@ -202,14 +203,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
+ raw_spin_lock(&sd->input_pkt_queue.raw_lock);
#endif
}
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
+ raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
#endif
}
@@ -831,7 +832,8 @@ retry:
strcpy(name, dev->name);
rcu_read_unlock();
if (read_seqcount_retry(&devnet_rename_seq, seq)) {
- cond_resched();
+ mutex_lock(&devnet_rename_mutex);
+ mutex_unlock(&devnet_rename_mutex);
goto retry;
}
@@ -1097,30 +1099,28 @@ int dev_change_name(struct net_device *dev, const char *newname)
if (dev->flags & IFF_UP)
return -EBUSY;
- write_seqcount_begin(&devnet_rename_seq);
+ mutex_lock(&devnet_rename_mutex);
+ __raw_write_seqcount_begin(&devnet_rename_seq);
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- write_seqcount_end(&devnet_rename_seq);
- return 0;
- }
+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+ goto outunlock;
memcpy(oldname, dev->name, IFNAMSIZ);
err = dev_get_valid_name(net, dev, newname);
- if (err < 0) {
- write_seqcount_end(&devnet_rename_seq);
- return err;
- }
+ if (err < 0)
+ goto outunlock;
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
- write_seqcount_end(&devnet_rename_seq);
- return ret;
+ err = ret;
+ goto outunlock;
}
- write_seqcount_end(&devnet_rename_seq);
+ __raw_write_seqcount_end(&devnet_rename_seq);
+ mutex_unlock(&devnet_rename_mutex);
netdev_adjacent_rename_links(dev, oldname);
@@ -1141,7 +1141,8 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- write_seqcount_begin(&devnet_rename_seq);
+ mutex_lock(&devnet_rename_mutex);
+ __raw_write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
goto rollback;
@@ -1152,6 +1153,11 @@ rollback:
}
return err;
+
+outunlock:
+ __raw_write_seqcount_end(&devnet_rename_seq);
+ mutex_unlock(&devnet_rename_mutex);
+ return err;
}
/**
@@ -2147,6 +2153,7 @@ static inline void __netif_reschedule(struct Qdisc *q)
sd->output_queue_tailp = &q->next_sched;
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
void __netif_schedule(struct Qdisc *q)
@@ -2181,6 +2188,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
__this_cpu_write(softnet_data.completion_queue, skb);
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
@@ -3233,6 +3241,7 @@ enqueue:
rps_unlock(sd);
local_irq_restore(flags);
+ preempt_check_resched_rt();
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
@@ -3255,7 +3264,7 @@ static int netif_rx_internal(struct sk_buff *skb)
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
+ migrate_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -3265,13 +3274,13 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
+ migrate_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
+ put_cpu_light();
}
return ret;
}
@@ -3305,16 +3314,44 @@ int netif_rx_ni(struct sk_buff *skb)
trace_netif_rx_ni_entry(skb);
- preempt_disable();
+ local_bh_disable();
err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
+ local_bh_enable();
return err;
}
EXPORT_SYMBOL(netif_rx_ni);
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * RT runs ksoftirqd as a real time thread and the root_lock is a
+ * "sleeping spinlock". If the trylock fails then we can go into an
+ * infinite loop when ksoftirqd preempted the task which actually
+ * holds the lock, because we requeue q and raise NET_TX softirq
+ * causing ksoftirqd to loop forever.
+ *
+ * It's safe to use spin_lock on RT here as softirqs run in thread
+ * context and cannot deadlock against the thread which is holding
+ * root_lock.
+ *
+ * On !RT the trylock might fail, but there we bail out from the
+ * softirq loop after 10 attempts which we can't do on RT. And the
+ * task holding root_lock cannot be preempted, so the only downside of
+ * that trylock is that we need 10 loops to decide that we should have
+ * given up in the first one :)
+ */
+static inline int take_root_lock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ return 1;
+}
+#else
+static inline int take_root_lock(spinlock_t *lock)
+{
+ return spin_trylock(lock);
+}
+#endif
+
static void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
@@ -3356,7 +3393,7 @@ static void net_tx_action(struct softirq_action *h)
head = head->next_sched;
root_lock = qdisc_lock(q);
- if (spin_trylock(root_lock)) {
+ if (take_root_lock(root_lock)) {
smp_mb__before_clear_bit();
clear_bit(__QDISC_STATE_SCHED,
&q->state);
@@ -3754,7 +3791,7 @@ static void flush_backlog(void *arg)
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev == dev) {
__skb_unlink(skb, &sd->input_pkt_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&sd->tofree_queue, skb);
input_queue_head_incr(sd);
}
}
@@ -3763,10 +3800,13 @@ static void flush_backlog(void *arg)
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev == dev) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&sd->tofree_queue, skb);
input_queue_head_incr(sd);
}
}
+
+ if (!skb_queue_empty(&sd->tofree_queue))
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
static int napi_gro_complete(struct sk_buff *skb)
@@ -4153,6 +4193,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
} else
#endif
local_irq_enable();
+ preempt_check_resched_rt();
}
static int process_backlog(struct napi_struct *napi, int quota)
@@ -4225,6 +4266,7 @@ void __napi_schedule(struct napi_struct *n)
local_irq_save(flags);
____napi_schedule(&__get_cpu_var(softnet_data), n);
local_irq_restore(flags);
+ preempt_check_resched_rt();
}
EXPORT_SYMBOL(__napi_schedule);
@@ -4347,10 +4389,17 @@ static void net_rx_action(struct softirq_action *h)
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
+ struct sk_buff *skb;
void *have;
local_irq_disable();
+ while ((skb = __skb_dequeue(&sd->tofree_queue))) {
+ local_irq_enable();
+ kfree_skb(skb);
+ local_irq_disable();
+ }
+
while (!list_empty(&sd->poll_list)) {
struct napi_struct *n;
int work, weight;
@@ -6753,6 +6802,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ preempt_check_resched_rt();
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
@@ -6763,6 +6813,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
netif_rx_internal(skb);
input_queue_head_incr(oldsd);
}
+ while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
+ kfree_skb(skb);
+ }
return NOTIFY_OK;
}
@@ -7072,8 +7125,9 @@ static int __init net_dev_init(void)
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
- skb_queue_head_init(&sd->input_pkt_queue);
- skb_queue_head_init(&sd->process_queue);
+ skb_queue_head_init_raw(&sd->input_pkt_queue);
+ skb_queue_head_init_raw(&sd->process_queue);
+ skb_queue_head_init_raw(&sd->tofree_queue);
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8f6391bbf509..8b3aac58cc47 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -62,6 +62,7 @@
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
+#include <linux/locallock.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -335,6 +336,7 @@ struct netdev_alloc_cache {
unsigned int pagecnt_bias;
};
static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
+static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
@@ -343,7 +345,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
int order;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(netdev_alloc_lock, flags);
nc = &__get_cpu_var(netdev_alloc_cache);
if (unlikely(!nc->frag.page)) {
refill:
@@ -377,7 +379,7 @@ recycle:
nc->frag.offset += fragsz;
nc->pagecnt_bias--;
end:
- local_irq_restore(flags);
+ local_unlock_irqrestore(netdev_alloc_lock, flags);
return data;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index c8069561bdb7..479a25a271d5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2386,12 +2386,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
if (sk->sk_lock.owned)
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
/*
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1e4aa8354f93..33fa45891c29 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -69,6 +69,7 @@
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
+#include <linux/sysrq.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
@@ -801,6 +802,30 @@ static void icmp_redirect(struct sk_buff *skb)
}
/*
+ * 32bit and 64bit have different timestamp length, so we check for
+ * the cookie at offset 20 and verify it is repeated at offset 50
+ */
+#define CO_POS0 20
+#define CO_POS1 50
+#define CO_SIZE sizeof(int)
+#define ICMP_SYSRQ_SIZE 57
+
+/*
+ * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
+ * pattern and if it matches send the next byte as a trigger to sysrq.
+ */
+static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
+{
+ int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
+ char *p = skb->data;
+
+ if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
+ !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
+ p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
+ handle_sysrq(p[CO_POS0 + CO_SIZE]);
+}
+
+/*
* Handle ICMP_ECHO ("ping") requests.
*
* RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
@@ -827,6 +852,11 @@ static void icmp_echo(struct sk_buff *skb)
icmp_param.data_len = skb->len;
icmp_param.head_len = sizeof(struct icmphdr);
icmp_reply(&icmp_param, skb);
+
+ if (skb->len == ICMP_SYSRQ_SIZE &&
+ net->ipv4.sysctl_icmp_echo_sysrq) {
+ icmp_check_sysrq(net, skb);
+ }
}
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ed88d781248f..e9f04e0bc56a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -79,6 +79,7 @@
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
+#include <linux/locallock.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
@@ -1476,6 +1477,9 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
.uc_ttl = -1,
};
+/* serialize concurrent calls on the same CPU to ip_send_unicast_reply */
+static DEFINE_LOCAL_IRQ_LOCK(unicast_lock);
+
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
__be32 saddr, const struct ip_reply_arg *arg,
unsigned int len)
@@ -1515,7 +1519,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
if (IS_ERR(rt))
return;
- inet = &get_cpu_var(unicast_sock);
+ inet = &get_locked_var(unicast_lock, unicast_sock);
inet->tos = arg->tos;
sk = &inet->sk;
@@ -1539,7 +1543,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
ip_push_pending_frames(sk, &fl4);
}
- put_cpu_var(unicast_sock);
+ put_locked_var(unicast_lock, unicast_sock);
ip_rt_put(rt);
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba052b43d..dd69fa66d3bc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -776,6 +776,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "icmp_echo_sysrq",
+ .data = &init_net.ipv4.sysctl_icmp_echo_sysrq,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "icmp_ignore_bogus_error_responses",
.data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
.maxlen = sizeof(int),
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3e57f96c9666..212bcc278306 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3346,7 +3346,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
struct ieee80211_supported_band *sband;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
- WARN_ON_ONCE(softirq_count() == 0);
+ WARN_ON_ONCE_NONRT(softirq_count() == 0);
if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
goto drop;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 1fbab0cdd302..34a7b7d470c0 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -21,11 +21,17 @@
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/locallock.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include "nf_internals.h"
+#ifdef CONFIG_PREEMPT_RT_BASE
+DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
+EXPORT_PER_CPU_SYMBOL(xt_write_lock);
+#endif
+
static DEFINE_MUTEX(afinfo_mutex);
const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 48a6a93db296..5a2708ced63e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -63,6 +63,7 @@
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
+#include <linux/delay.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -700,7 +701,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
if (BLOCK_NUM_PKTS(pbd)) {
while (atomic_read(&pkc->blk_fill_in_prog)) {
/* Waiting for skb_copy_bits to finish... */
- cpu_relax();
+ cpu_chill();
}
}
@@ -951,7 +952,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
if (!(status & TP_STATUS_BLK_TMO)) {
while (atomic_read(&pkc->blk_fill_in_prog)) {
/* Waiting for skb_copy_bits to finish... */
- cpu_relax();
+ cpu_chill();
}
}
prb_close_block(pkc, pbd, po, status);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index e8fdb172adbb..5a44c6e77cd8 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -34,6 +34,7 @@
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/llist.h>
+#include <linux/delay.h>
#include "rds.h"
#include "ib.h"
@@ -286,7 +287,7 @@ static inline void wait_clean_list_grace(void)
for_each_online_cpu(cpu) {
flag = &per_cpu(clean_list_grace, cpu);
while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
- cpu_relax();
+ cpu_chill();
}
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e82e43b69c33..9dbe76beedb0 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -850,7 +850,7 @@ void dev_deactivate_many(struct list_head *head)
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list)
while (some_qdisc_is_busy(dev))
- yield();
+ msleep(1);
}
void dev_deactivate(struct net_device *dev)