aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/ceph/osd_client.c31
-rw-r--r--net/core/dev.c4
-rw-r--r--net/core/skbuff.c30
-rw-r--r--net/core/sock.c19
-rw-r--r--net/ipv4/ip_forward.c3
-rw-r--r--net/ipv4/tcp_input.c7
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_output.c66
-rw-r--r--net/ipv6/ip6_output.c3
-rw-r--r--net/ipv6/ndisc.c9
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/mac80211/wep.c6
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c26
-rw-r--r--net/netlink/af_netlink.c6
-rw-r--r--net/socket.c24
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c23
16 files changed, 189 insertions, 72 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 45f077c60348..5b7ef5bb90ed 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1932,20 +1932,29 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
err = __map_request(osdc, req,
force_resend || force_resend_writes);
dout("__map_request returned %d\n", err);
- if (err == 0)
- continue; /* no change and no osd was specified */
if (err < 0)
continue; /* hrm! */
- if (req->r_osd == NULL) {
- dout("tid %llu maps to no valid osd\n", req->r_tid);
- needmap++; /* request a newer map */
- continue;
- }
+ if (req->r_osd == NULL || err > 0) {
+ if (req->r_osd == NULL) {
+ dout("lingering %p tid %llu maps to no osd\n",
+ req, req->r_tid);
+ /*
+ * A homeless lingering request makes
+ * no sense, as it's job is to keep
+ * a particular OSD connection open.
+ * Request a newer map and kick the
+ * request, knowing that it won't be
+ * resent until we actually get a map
+ * that can tell us where to send it.
+ */
+ needmap++;
+ }
- dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- __register_request(osdc, req);
- __unregister_linger_request(osdc, req);
+ dout("kicking lingering %p tid %llu osd%d\n", req,
+ req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
+ __register_request(osdc, req);
+ __unregister_linger_request(osdc, req);
+ }
}
reset_changed_osds(osdc);
mutex_unlock(&osdc->request_mutex);
diff --git a/net/core/dev.c b/net/core/dev.c
index f6d8d7fe29ab..73abbd77d72c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2775,7 +2775,9 @@ static void skb_update_prio(struct sk_buff *skb)
#define skb_update_prio(skb)
#endif
-static DEFINE_PER_CPU(int, xmit_recursion);
+DEFINE_PER_CPU(int, xmit_recursion);
+EXPORT_SYMBOL(xmit_recursion);
+
#define RECURSION_LIMIT 10
/**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e2b1bba69882..69ec61abfb37 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -278,13 +278,14 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
- * build_skb - build a network buffer
+ * __build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of fragment, or 0 if head was kmalloced
+ * @frag_size: size of data, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator.
+ * @frag_size is 0, otherwise data should come from the page allocator
+ * or vmalloc()
* The return is the new skb buffer.
* On a failure the return is %NULL, and @data is not freed.
* Notes :
@@ -295,7 +296,7 @@ EXPORT_SYMBOL(__alloc_skb);
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data, unsigned int frag_size)
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
@@ -309,7 +310,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
- skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -326,6 +326,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
return skb;
}
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __build_skb(data, frag_size);
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (virt_to_head_page(data)->pfmemalloc)
+ skb->pfmemalloc = 1;
+ }
+ return skb;
+}
EXPORT_SYMBOL(build_skb);
struct netdev_alloc_cache {
@@ -352,7 +369,8 @@ refill:
gfp_t gfp = gfp_mask;
if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN;
+ gfp |= __GFP_COMP | __GFP_NOWARN |
+ __GFP_NOMEMALLOC;
nc->frag.page = alloc_pages(gfp, order);
if (likely(nc->frag.page))
break;
diff --git a/net/core/sock.c b/net/core/sock.c
index c8069561bdb7..650dd58ebd05 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -659,6 +659,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
sock_reset_flag(sk, bit);
}
+bool sk_mc_loop(struct sock *sk)
+{
+ if (dev_recursion_level())
+ return false;
+ if (!sk)
+ return true;
+ switch (sk->sk_family) {
+ case AF_INET:
+ return inet_sk(sk)->mc_loop;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return inet6_sk(sk)->mc_loop;
+#endif
+ }
+ WARN_ON(1);
+ return true;
+}
+EXPORT_SYMBOL(sk_mc_loop);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index ecb34b5ea42f..57075c4508f7 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -127,6 +127,9 @@ int ip_forward(struct sk_buff *skb)
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
+ if (unlikely(skb->sk))
+ goto drop;
+
if (skb_warn_if_lro(skb))
goto drop;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b4842383aa2d..d3d2c0abbad7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3065,10 +3065,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
}
- if (!(sacked & TCPCB_SACKED_ACKED))
+ if (!(sacked & TCPCB_SACKED_ACKED)) {
reord = min(pkts_acked, reord);
- if (!after(scb->end_seq, tp->high_seq))
- flag |= FLAG_ORIG_SACK_ACKED;
+ if (!after(scb->end_seq, tp->high_seq))
+ flag |= FLAG_ORIG_SACK_ACKED;
+ }
}
if (sacked & TCPCB_SACKED_ACKED)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3530ad85c47c..ed04620a35ed 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1876,7 +1876,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk->sk_state != TCP_TIME_WAIT) {
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst = ACCESS_ONCE(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bbc40bb84373..ecd60bd9d616 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2595,39 +2595,65 @@ begin_fwd:
}
}
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int mss_now;
- /* Optimization, tack on the FIN if we have a queue of
- * unsent frames. But be careful about outgoing SACKS
- * and IP options.
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
*/
- mss_now = tcp_current_mss(sk);
-
- if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
- TCP_SKB_CB(skb)->end_seq++;
+ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
} else {
- /* Socket is locked, keep trying until memory is available. */
- for (;;) {
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
- if (skb)
- break;
- yield();
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
}
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
@@ -2796,6 +2822,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
}
#endif
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d7907ecf0b75..066d0b03f2b8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -555,7 +555,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
struct sk_buff *frag;
struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
- struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+ struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
+ inet6_sk(skb->sk) : NULL;
struct ipv6hdr *tmp_hdr;
struct frag_hdr *fh;
unsigned int mtu, hlen, left, len;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 09a22f4f36c9..bcd65186b497 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1193,7 +1193,14 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (rt)
rt6_set_expires(rt, jiffies + (HZ * lifetime));
if (ra_msg->icmph.icmp6_hop_limit) {
- in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+ /* Only set hop_limit on the interface if it is higher than
+ * the current hop_limit.
+ */
+ if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+ in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+ } else {
+ ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+ }
if (rt)
dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
ra_msg->icmph.icmp6_hop_limit);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3291857694b1..a9b6ea11da29 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1636,7 +1636,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk->sk_state != TCP_TIME_WAIT) {
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst = ACCESS_ONCE(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index 6ee2b5863572..f21b142dee1f 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -98,8 +98,7 @@ static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local,
hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
- if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN ||
- skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
+ if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
return NULL;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -169,6 +168,9 @@ int ieee80211_wep_encrypt(struct ieee80211_local *local,
size_t len;
u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
+ if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN))
+ return -1;
+
iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx);
if (!iv)
return -1;
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index d25f29377648..957c1db66652 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -14,6 +14,30 @@
static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ;
+static bool nf_generic_should_process(u8 proto)
+{
+ switch (proto) {
+#ifdef CONFIG_NF_CT_PROTO_SCTP_MODULE
+ case IPPROTO_SCTP:
+ return false;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_DCCP_MODULE
+ case IPPROTO_DCCP:
+ return false;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE
+ case IPPROTO_GRE:
+ return false;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE_MODULE
+ case IPPROTO_UDPLITE:
+ return false;
+#endif
+ default:
+ return true;
+ }
+}
+
static inline struct nf_generic_net *generic_pernet(struct net *net)
{
return &net->ct.nf_ct_proto.generic;
@@ -67,7 +91,7 @@ static int generic_packet(struct nf_conn *ct,
static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff, unsigned int *timeouts)
{
- return true;
+ return nf_generic_should_process(nf_ct_protonum(ct));
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 1d52506bda14..a0b0ea949192 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1624,13 +1624,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
if (data == NULL)
return NULL;
- skb = build_skb(data, size);
+ skb = __build_skb(data, size);
if (skb == NULL)
vfree(data);
- else {
- skb->head_frag = 0;
+ else
skb->destructor = netlink_skb_destructor;
- }
return skb;
}
diff --git a/net/socket.c b/net/socket.c
index 1b2c2d62ff20..b72fc137e1a6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2007,14 +2007,12 @@ static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg,
int err, ctl_len, total_len;
err = -EFAULT;
- if (MSG_CMSG_COMPAT & flags) {
- if (get_compat_msghdr(msg_sys, msg_compat))
- return -EFAULT;
- } else {
+ if (MSG_CMSG_COMPAT & flags)
+ err = get_compat_msghdr(msg_sys, msg_compat);
+ else
err = copy_msghdr_from_user(msg_sys, msg);
- if (err)
- return err;
- }
+ if (err)
+ return err;
if (msg_sys->msg_iovlen > UIO_FASTIOV) {
err = -EMSGSIZE;
@@ -2219,14 +2217,12 @@ static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
struct sockaddr __user *uaddr;
int __user *uaddr_len;
- if (MSG_CMSG_COMPAT & flags) {
- if (get_compat_msghdr(msg_sys, msg_compat))
- return -EFAULT;
- } else {
+ if (MSG_CMSG_COMPAT & flags)
+ err = get_compat_msghdr(msg_sys, msg_compat);
+ else
err = copy_msghdr_from_user(msg_sys, msg);
- if (err)
- return err;
- }
+ if (err)
+ return err;
if (msg_sys->msg_iovlen > UIO_FASTIOV) {
err = -EMSGSIZE;
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 1ec19f6f0c2b..eeeba5adee6d 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -793,20 +793,26 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
{
u32 value_follows;
int err;
+ struct page *scratch;
+
+ scratch = alloc_page(GFP_KERNEL);
+ if (!scratch)
+ return -ENOMEM;
+ xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
/* res->status */
err = gssx_dec_status(xdr, &res->status);
if (err)
- return err;
+ goto out_free;
/* res->context_handle */
err = gssx_dec_bool(xdr, &value_follows);
if (err)
- return err;
+ goto out_free;
if (value_follows) {
err = gssx_dec_ctx(xdr, res->context_handle);
if (err)
- return err;
+ goto out_free;
} else {
res->context_handle = NULL;
}
@@ -814,11 +820,11 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
/* res->output_token */
err = gssx_dec_bool(xdr, &value_follows);
if (err)
- return err;
+ goto out_free;
if (value_follows) {
err = gssx_dec_buffer(xdr, res->output_token);
if (err)
- return err;
+ goto out_free;
} else {
res->output_token = NULL;
}
@@ -826,14 +832,17 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
/* res->delegated_cred_handle */
err = gssx_dec_bool(xdr, &value_follows);
if (err)
- return err;
+ goto out_free;
if (value_follows) {
/* we do not support upcall servers sending this data. */
- return -EINVAL;
+ err = -EINVAL;
+ goto out_free;
}
/* res->options */
err = gssx_dec_option_array(xdr, &res->options);
+out_free:
+ __free_page(scratch);
return err;
}