aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/bridge/br_netfilter.c17
-rw-r--r--net/core/skbuff.c30
-rw-r--r--net/ipv4/ip_forward.c3
-rw-r--r--net/ipv4/ping.c1
-rw-r--r--net/ipv4/tcp_output.c64
-rw-r--r--net/netlink/af_netlink.c6
6 files changed, 90 insertions, 31 deletions
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 1a4f32c09ad5..f076a8ede00b 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -650,6 +650,13 @@ static int br_nf_forward_finish(struct sk_buff *skb)
struct net_device *in;
if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+ int frag_max_size;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ frag_max_size = IPCB(skb)->frag_max_size;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
+ }
+
in = nf_bridge->physindev;
if (nf_bridge->mask & BRNF_PKT_TYPE) {
skb->pkt_type = PACKET_OTHERHOST;
@@ -709,8 +716,14 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
nf_bridge->mask |= BRNF_PKT_TYPE;
}
- if (pf == NFPROTO_IPV4 && br_parse_ip_options(skb))
- return NF_DROP;
+ if (pf == NFPROTO_IPV4) {
+ int frag_max = BR_INPUT_SKB_CB(skb)->frag_max_size;
+
+ if (br_parse_ip_options(skb))
+ return NF_DROP;
+
+ IPCB(skb)->frag_max_size = frag_max;
+ }
/* The physdev module checks on this */
nf_bridge->mask |= BRNF_BRIDGED;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 17fd8dca921e..02ebb7133312 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -278,13 +278,14 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
- * build_skb - build a network buffer
+ * __build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of fragment, or 0 if head was kmalloced
+ * @frag_size: size of data, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator.
+ * @frag_size is 0, otherwise data should come from the page allocator
+ * or vmalloc()
* The return is the new skb buffer.
* On a failure the return is %NULL, and @data is not freed.
* Notes :
@@ -295,7 +296,7 @@ EXPORT_SYMBOL(__alloc_skb);
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data, unsigned int frag_size)
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
@@ -309,7 +310,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
- skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -326,6 +326,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
return skb;
}
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb = __build_skb(data, frag_size);
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+ if (virt_to_head_page(data)->pfmemalloc)
+ skb->pfmemalloc = 1;
+ }
+ return skb;
+}
EXPORT_SYMBOL(build_skb);
struct netdev_alloc_cache {
@@ -352,7 +369,8 @@ refill:
gfp_t gfp = gfp_mask;
if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN;
+ gfp |= __GFP_COMP | __GFP_NOWARN |
+ __GFP_NOMEMALLOC;
nc->frag.page = alloc_pages(gfp, order);
if (likely(nc->frag.page))
break;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 787b3c294ce6..d5410b57da19 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -81,6 +81,9 @@ int ip_forward(struct sk_buff *skb)
if (skb->pkt_type != PACKET_HOST)
goto drop;
+ if (unlikely(skb->sk))
+ goto drop;
+
if (skb_warn_if_lro(skb))
goto drop;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index a5c49d657ab1..64f4edb2dbf9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk)
if (sk_hashed(sk)) {
write_lock_bh(&ping_table.lock);
hlist_nulls_del(&sk->sk_nulls_node);
+ sk_nulls_node_init(&sk->sk_nulls_node);
sock_put(sk);
isk->inet_num = 0;
isk->inet_sport = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 32dcb4e05b6b..dc9f925b0cd5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2717,39 +2717,65 @@ begin_fwd:
}
}
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+ int amt, status;
+
+ if (size <= sk->sk_forward_alloc)
+ return;
+ amt = sk_mem_pages(size);
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
*/
void tcp_send_fin(struct sock *sk)
{
+ struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int mss_now;
- /* Optimization, tack on the FIN if we have a queue of
- * unsent frames. But be careful about outgoing SACKS
- * and IP options.
+ /* Optimization, tack on the FIN if we have one skb in write queue and
+ * this skb was not yet sent, or we are under memory pressure.
+ * Note: in the latter case, FIN packet will be sent after a timeout,
+ * as TCP stack thinks it has already been transmitted.
*/
- mss_now = tcp_current_mss(sk);
-
- if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
- TCP_SKB_CB(skb)->end_seq++;
+ if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+ TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+ TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
+ if (!tcp_send_head(sk)) {
+ /* This means tskb was already sent.
+ * Pretend we included the FIN on previous transmit.
+ * We need to set tp->snd_nxt to the value it would have
+ * if FIN had been sent. This is because retransmit path
+ * does not change tp->snd_nxt.
+ */
+ tp->snd_nxt++;
+ return;
+ }
} else {
- /* Socket is locked, keep trying until memory is available. */
- for (;;) {
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
- if (skb)
- break;
- yield();
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+ if (unlikely(!skb)) {
+ if (tskb)
+ goto coalesce;
+ return;
}
+ skb_reserve(skb, MAX_TCP_HEADER);
+ sk_forced_wmem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
/* We get here when a process closes a file descriptor (either due to
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 79c965a51ab2..c0a418766f9c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1599,13 +1599,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
if (data == NULL)
return NULL;
- skb = build_skb(data, size);
+ skb = __build_skb(data, size);
if (skb == NULL)
vfree(data);
- else {
- skb->head_frag = 0;
+ else
skb->destructor = netlink_skb_destructor;
- }
return skb;
}