aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt9
-rw-r--r--include/net/sock.h2
-rw-r--r--include/net/tcp.h1
-rw-r--r--net/core/sock.c1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c10
-rw-r--r--net/ipv4/tcp.c28
-rw-r--r--net/ipv4/tcp_input.c34
-rw-r--r--net/ipv4/tcp_output.c2
8 files changed, 80 insertions, 7 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 3458d6343e0..3994f0bbeeb 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -478,6 +478,15 @@ tcp_syn_retries - INTEGER
tcp_timestamps - BOOLEAN
Enable timestamps as defined in RFC1323.
+tcp_min_tso_segs - INTEGER
+ Minimal number of segments per TSO frame.
+ Since linux-3.12, TCP does an automatic sizing of TSO frames,
+ depending on flow rate, instead of filling 64Kbytes packets.
+ For specific usages, it's possible to force TCP to build big
+ TSO frames. Note that TCP stack might split too big TSO packets
+ if available window is too small.
+ Default: 2
+
tcp_tso_win_divisor - INTEGER
This allows control over what percentage of the congestion window
can be consumed by a single TSO frame.
diff --git a/include/net/sock.h b/include/net/sock.h
index 66772cf8c3c..cec4c723db9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -230,6 +230,7 @@ struct cg_proto;
* @sk_wmem_queued: persistent queue size
* @sk_forward_alloc: space allocated forward
* @sk_allocation: allocation mode
+ * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
* @sk_sndbuf: size of send buffer in bytes
* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -355,6 +356,7 @@ struct sock {
kmemcheck_bitfield_end(flags);
int sk_wmem_queued;
gfp_t sk_allocation;
+ u32 sk_pacing_rate; /* bytes per second */
netdev_features_t sk_route_caps;
netdev_features_t sk_route_nocaps;
int sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5bba80fbd1d..3fc77e90624 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -287,6 +287,7 @@ extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_min_tso_segs;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/core/sock.c b/net/core/sock.c
index d6d024cfaaa..6565431b0e6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2271,6 +2271,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_stamp = ktime_set(-1L, 0);
+ sk->sk_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3f25e75ae69..90b26beb84d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
static int zero;
static int one = 1;
static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -753,6 +754,15 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &four,
},
{
+ .procname = "tcp_min_tso_segs",
+ .data = &sysctl_tcp_min_tso_segs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &gso_max_segs,
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2b1b57f213b..c888abf5a72 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,6 +282,8 @@
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -786,12 +788,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
xmit_size_goal = mss_now;
if (large_allowed && sk_can_gso(sk)) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
+ u32 gso_size, hlen;
+
+ /* Maybe we should/could use sk->sk_prot->max_header here ? */
+ hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+ inet_csk(sk)->icsk_ext_hdr_len +
+ tp->tcp_header_len;
+
+ /* Goal is to send at least one packet per ms,
+ * not one big TSO packet every 100 ms.
+ * This preserves ACK clocking and is consistent
+ * with tcp_tso_should_defer() heuristic.
+ */
+ gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+ gso_size = max_t(u32, gso_size,
+ sysctl_tcp_min_tso_segs * mss_now);
+
+ xmit_size_goal = min_t(u32, gso_size,
+ sk->sk_gso_max_size - 1 - hlen);
- /* TSQ : try to have two TSO segments in flight */
+ /* TSQ : try to have at least two segments in flight
+ * (one in NIC TX ring, another in Qdisc)
+ */
xmit_size_goal = min_t(u32, xmit_size_goal,
sysctl_tcp_limit_output_bytes >> 1);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4b75aad14b0..70883b87bc5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -699,6 +699,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
}
}
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u64 rate;
+
+ /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+ rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+ rate *= max(tp->snd_cwnd, tp->packets_out);
+
+ /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+ * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+ * We probably need usec resolution in the future.
+ * Note: This also takes care of possible srtt=0 case,
+ * when tcp_rtt_estimator() was not yet called.
+ */
+ if (tp->srtt > 8 + 2)
+ do_div(rate, tp->srtt);
+
+ sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
@@ -3330,7 +3358,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
- u32 prior_in_flight;
+ u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
u32 prior_fackets;
int prior_packets = tp->packets_out;
int prior_sacked = tp->sacked_out;
@@ -3438,6 +3466,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
+ if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+ tcp_update_pacing_rate(sk);
return 1;
no_queue:
@@ -5736,6 +5766,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
} else
tcp_init_metrics(sk);
+ tcp_update_pacing_rate(sk);
+
/* Prevent spurious tcp_cwnd_restart() on
* first data packet.
*/
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0145ce7e609..400b811f5c0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1623,7 +1623,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
/* If a full-sized TSO skb can be sent, do it. */
if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
- sk->sk_gso_max_segs * tp->mss_cache))
+ tp->xmit_size_goal_segs * tp->mss_cache))
goto send_now;
/* Middle in queue won't get any more data, full sendable already? */