From 59a252ff8c0f2fa32c896f69d56ae33e641ce7ad Mon Sep 17 00:00:00 2001
From: Greg Banks <gnb@sgi.com>
Date: Tue, 13 Jan 2009 21:26:35 +1100
Subject: knfsd: avoid overloading the CPU scheduler with enormous load
 averages

Avoid overloading the CPU scheduler with enormous load averages
when handling high call-rate NFS loads.  When the knfsd bottom half
is made aware of an incoming call by the socket layer, it tries to
choose an nfsd thread and wake it up.  As long as there are idle
threads, one will be woken up.

If there are lot of nfsd threads (a sensible configuration when
the server is disk-bound or is running an HSM), there will be many
more nfsd threads than CPUs to run them.  Under a high call-rate
low service-time workload, the result is that almost every nfsd is
runnable, but only a handful are actually able to run.  This situation
causes two significant problems:

1. The CPU scheduler takes over 10% of each CPU, which is robbing
   the nfsd threads of valuable CPU time.

2. At a high enough load, the nfsd threads starve userspace threads
   of CPU time, to the point where daemons like portmap and rpc.mountd
   do not schedule for tens of seconds at a time.  Clients attempting
   to mount an NFS filesystem timeout at the very first step (opening
   a TCP connection to portmap) because portmap cannot wake up from
   select() and call accept() in time.

Disclaimer: these effects were observed on a SLES9 kernel, modern
kernels' schedulers may behave more gracefully.

The solution is simple: keep in each svc_pool a counter of the number
of threads which have been woken but have not yet run, and do not wake
any more if that count reaches an arbitrary small threshold.

Testing was on a 4 CPU 4 NIC Altix using 4 IRIX clients, each with 16
synthetic client threads simulating an rsync (i.e. recursive directory
listing) workload reading from an i386 RH9 install image (161480
regular files in 10841 directories) on the server.  That tree is small
enough to fill in the server's RAM so no disk traffic was involved.
This setup gives a sustained call rate in excess of 60000 calls/sec
before being CPU-bound on the server.  The server was running 128 nfsds.

Profiling showed schedule() taking 6.7% of every CPU, and __wake_up()
taking 5.2%.  This patch drops those contributions to 3.0% and 2.2%.
Load average was over 120 before the patch, and 20.9 after.

This patch is a forward-ported version of knfsd-avoid-nfsd-overload
which has been shipping in the SGI "Enhanced NFS" product since 2006.
It has been posted before:

http://article.gmane.org/gmane.linux.nfs/10374

Signed-off-by: Greg Banks <gnb@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc_xprt.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e588df5d6b34..0551b6b6cf8c 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -14,6 +14,8 @@
 
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
+#define SVC_MAX_WAKING 5
+
 static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
 static int svc_deferred_recv(struct svc_rqst *rqstp);
 static struct cache_deferred_req *svc_defer(struct cache_req *req);
@@ -298,6 +300,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	struct svc_pool *pool;
 	struct svc_rqst	*rqstp;
 	int cpu;
+	int thread_avail;
 
 	if (!(xprt->xpt_flags &
 	      ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
@@ -309,12 +312,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 
 	spin_lock_bh(&pool->sp_lock);
 
-	if (!list_empty(&pool->sp_threads) &&
-	    !list_empty(&pool->sp_sockets))
-		printk(KERN_ERR
-		       "svc_xprt_enqueue: "
-		       "threads and transports both waiting??\n");
-
 	if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
 		/* Don't enqueue dead transports */
 		dprintk("svc: transport %p is dead, not enqueued\n", xprt);
@@ -353,7 +350,14 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	}
 
  process:
-	if (!list_empty(&pool->sp_threads)) {
+	/* Work out whether threads are available */
+	thread_avail = !list_empty(&pool->sp_threads);	/* threads are asleep */
+	if (pool->sp_nwaking >= SVC_MAX_WAKING) {
+		/* too many threads are runnable and trying to wake up */
+		thread_avail = 0;
+	}
+
+	if (thread_avail) {
 		rqstp = list_entry(pool->sp_threads.next,
 				   struct svc_rqst,
 				   rq_list);
@@ -368,6 +372,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 		svc_xprt_get(xprt);
 		rqstp->rq_reserved = serv->sv_max_mesg;
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+		rqstp->rq_waking = 1;
+		pool->sp_nwaking++;
 		BUG_ON(xprt->xpt_pool != pool);
 		wake_up(&rqstp->rq_wait);
 	} else {
@@ -633,6 +639,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 		return -EINTR;
 
 	spin_lock_bh(&pool->sp_lock);
+	if (rqstp->rq_waking) {
+		rqstp->rq_waking = 0;
+		pool->sp_nwaking--;
+		BUG_ON(pool->sp_nwaking < 0);
+	}
 	xprt = svc_xprt_dequeue(pool);
 	if (xprt) {
 		rqstp->rq_xprt = xprt;
-- 
cgit v1.2.3


From 03cf6c9f49a8fea953d38648d016e3f46e814991 Mon Sep 17 00:00:00 2001
From: Greg Banks <gnb@sgi.com>
Date: Tue, 13 Jan 2009 21:26:36 +1100
Subject: knfsd: add file to export stats about nfsd pools

Add /proc/fs/nfsd/pool_stats to export to userspace various
statistics about the operation of rpc server thread pools.

This patch is based on a forward-ported version of
knfsd-add-pool-thread-stats which has been shipping in the SGI
"Enhanced NFS" product since 2006 and which was previously
posted:

http://article.gmane.org/gmane.linux.nfs/10375

It has also been updated thus:

 * moved EXPORT_SYMBOL() to near the function it exports
 * made the new struct struct seq_operations const
 * used SEQ_START_TOKEN instead of ((void *)1)
 * merged fix from SGI PV 990526 "sunrpc: use dprintk instead of
   printk in svc_pool_stats_*()" by Harshula Jayasuriya.
 * merged fix from SGI PV 964001 "Crash reading pool_stats before
   nfsds are started".

Signed-off-by: Greg Banks <gnb@sgi.com>
Signed-off-by: Harshula Jayasuriya <harshula@sgi.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc_xprt.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 0551b6b6cf8c..1e66f2491460 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -318,6 +318,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 		goto out_unlock;
 	}
 
+	pool->sp_stats.packets++;
+
 	/* Mark transport as busy. It will remain in this state until
 	 * the provider calls svc_xprt_received. We update XPT_BUSY
 	 * atomically because it also guards against trying to enqueue
@@ -355,6 +357,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 	if (pool->sp_nwaking >= SVC_MAX_WAKING) {
 		/* too many threads are runnable and trying to wake up */
 		thread_avail = 0;
+		pool->sp_stats.overloads_avoided++;
 	}
 
 	if (thread_avail) {
@@ -374,11 +377,13 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
 		rqstp->rq_waking = 1;
 		pool->sp_nwaking++;
+		pool->sp_stats.threads_woken++;
 		BUG_ON(xprt->xpt_pool != pool);
 		wake_up(&rqstp->rq_wait);
 	} else {
 		dprintk("svc: transport %p put into queue\n", xprt);
 		list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+		pool->sp_stats.sockets_queued++;
 		BUG_ON(xprt->xpt_pool != pool);
 	}
 
@@ -591,6 +596,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 	int			pages;
 	struct xdr_buf		*arg;
 	DECLARE_WAITQUEUE(wait, current);
+	long			time_left;
 
 	dprintk("svc: server %p waiting for data (to = %ld)\n",
 		rqstp, timeout);
@@ -676,12 +682,14 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 		add_wait_queue(&rqstp->rq_wait, &wait);
 		spin_unlock_bh(&pool->sp_lock);
 
-		schedule_timeout(timeout);
+		time_left = schedule_timeout(timeout);
 
 		try_to_freeze();
 
 		spin_lock_bh(&pool->sp_lock);
 		remove_wait_queue(&rqstp->rq_wait, &wait);
+		if (!time_left)
+			pool->sp_stats.threads_timedout++;
 
 		xprt = rqstp->rq_xprt;
 		if (!xprt) {
@@ -1114,3 +1122,93 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
 	return totlen;
 }
 EXPORT_SYMBOL_GPL(svc_xprt_names);
+
+
+/*----------------------------------------------------------------------------*/
+
+static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos)
+{
+	unsigned int pidx = (unsigned int)*pos;
+	struct svc_serv *serv = m->private;
+
+	dprintk("svc_pool_stats_start, *pidx=%u\n", pidx);
+
+	lock_kernel();
+	/* bump up the pseudo refcount while traversing */
+	svc_get(serv);
+	unlock_kernel();
+
+	if (!pidx)
+		return SEQ_START_TOKEN;
+	return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]);
+}
+
+static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct svc_pool *pool = p;
+	struct svc_serv *serv = m->private;
+
+	dprintk("svc_pool_stats_next, *pos=%llu\n", *pos);
+
+	if (p == SEQ_START_TOKEN) {
+		pool = &serv->sv_pools[0];
+	} else {
+		unsigned int pidx = (pool - &serv->sv_pools[0]);
+		if (pidx < serv->sv_nrpools-1)
+			pool = &serv->sv_pools[pidx+1];
+		else
+			pool = NULL;
+	}
+	++*pos;
+	return pool;
+}
+
+static void svc_pool_stats_stop(struct seq_file *m, void *p)
+{
+	struct svc_serv *serv = m->private;
+
+	lock_kernel();
+	/* this function really, really should have been called svc_put() */
+	svc_destroy(serv);
+	unlock_kernel();
+}
+
+static int svc_pool_stats_show(struct seq_file *m, void *p)
+{
+	struct svc_pool *pool = p;
+
+	if (p == SEQ_START_TOKEN) {
+		seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken overloads-avoided threads-timedout\n");
+		return 0;
+	}
+
+	seq_printf(m, "%u %lu %lu %lu %lu %lu\n",
+		pool->sp_id,
+		pool->sp_stats.packets,
+		pool->sp_stats.sockets_queued,
+		pool->sp_stats.threads_woken,
+		pool->sp_stats.overloads_avoided,
+		pool->sp_stats.threads_timedout);
+
+	return 0;
+}
+
+static const struct seq_operations svc_pool_stats_seq_ops = {
+	.start	= svc_pool_stats_start,
+	.next	= svc_pool_stats_next,
+	.stop	= svc_pool_stats_stop,
+	.show	= svc_pool_stats_show,
+};
+
+int svc_pool_stats_open(struct svc_serv *serv, struct file *file)
+{
+	int err;
+
+	err = seq_open(file, &svc_pool_stats_seq_ops);
+	if (!err)
+		((struct seq_file *) file->private_data)->private = serv;
+	return err;
+}
+EXPORT_SYMBOL(svc_pool_stats_open);
+
+/*----------------------------------------------------------------------------*/
-- 
cgit v1.2.3


From 47a14ef1af48c696b214ac168f056ddc79793d0e Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <aglo@citi.umich.edu>
Date: Tue, 21 Oct 2008 14:13:47 -0400
Subject: svcrpc: take advantage of tcp autotuning

Allow the NFSv4 server to make use of TCP autotuning behaviour, which
was previously disabled by setting the sk_userlocks variable.

Set the receive buffers to be big enough to receive the whole RPC
request, and set this for the listening socket, not the accept socket.

Remove the code that readjusts the receive/send buffer sizes for the
accepted socket. Previously this code was used to influence the TCP
window management behaviour, which is no longer needed when autotuning
is enabled.

This can improve IO bandwidth on networks with high bandwidth-delay
products, where a large tcp window is required.  It also simplifies
performance tuning, since getting adequate tcp buffers previously
required increasing the number of nfsd threads.

Signed-off-by: Olga Kornievskaia <aglo@citi.umich.edu>
Cc: Jim Rees <rees@umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svcsock.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5763e6460fea..7a2a90fb2e06 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -345,7 +345,6 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
 	lock_sock(sock->sk);
 	sock->sk->sk_sndbuf = snd * 2;
 	sock->sk->sk_rcvbuf = rcv * 2;
-	sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
 	release_sock(sock->sk);
 #endif
 }
@@ -797,23 +796,6 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
 		test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
 		test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
 
-	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
-		/* sndbuf needs to have room for one request
-		 * per thread, otherwise we can stall even when the
-		 * network isn't a bottleneck.
-		 *
-		 * We count all threads rather than threads in a
-		 * particular pool, which provides an upper bound
-		 * on the number of threads which will access the socket.
-		 *
-		 * rcvbuf just needs to be able to hold a few requests.
-		 * Normally they will be removed from the queue
-		 * as soon a a complete request arrives.
-		 */
-		svc_sock_setbufsize(svsk->sk_sock,
-				    (serv->sv_nrthreads+3) * serv->sv_max_mesg,
-				    3 * serv->sv_max_mesg);
-
 	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 
 	/* Receive data. If we haven't got the record length yet, get
@@ -1061,15 +1043,6 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 
 		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
 
-		/* initialise setting must have enough space to
-		 * receive and respond to one request.
-		 * svc_tcp_recvfrom will re-adjust if necessary
-		 */
-		svc_sock_setbufsize(svsk->sk_sock,
-				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
-				    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
-
-		set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
 		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		if (sk->sk_state != TCP_ESTABLISHED)
 			set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
@@ -1140,8 +1113,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 	/* Initialize the socket */
 	if (sock->type == SOCK_DGRAM)
 		svc_udp_init(svsk, serv);
-	else
+	else {
+		/* initialise setting must have enough space to
+		 * receive and respond to one request.
+		 */
+		svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
+					4 * serv->sv_max_mesg);
 		svc_tcp_init(svsk, serv);
+	}
 
 	/*
 	 * We start one listener per sv_serv.  We want AF_INET
-- 
cgit v1.2.3


From abd91ee979f785b7377216532620d98ab4e3e5af Mon Sep 17 00:00:00 2001
From: ideawu <ideawu@163.com>
Date: Thu, 26 Mar 2009 12:55:29 +0800
Subject: sunrpc/svc.c: Remove unused line 'rqstp->rq_server = serv;' in
 svc_process

There is no need to set rqstp->rq_server to serv, while serv is initialized as rqstp->rq_server at previous line. And between these two lines, there is no change to rqstp->rq_server.

Signed-off-by: ideawu <ideawu@163.com>
Reviewed-by: Tom Tucker <tom@opengridcomputing.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c51fed4d1af1..fff09a2d8960 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1093,7 +1093,6 @@ svc_process(struct svc_rqst *rqstp)
 	procp = versp->vs_proc + proc;
 	if (proc >= versp->vs_nproc || !procp->pc_func)
 		goto err_bad_proc;
-	rqstp->rq_server   = serv;
 	rqstp->rq_procinfo = procp;
 
 	/* Syntactic check complete */
-- 
cgit v1.2.3


From 2f425878b6a71571341dcd3f9e9d1a6f6355da9c Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 3 Apr 2009 08:27:32 +0300
Subject: nfsd: don't use the deferral service, return NFS4ERR_DELAY

On an NFSv4.1 server cache miss that causes an upcall, NFS4ERR_DELAY will be
returned. It is up to the NFSv4.1 client to resend only the operations that
have not been processed.

Initialize rq_usedeferral to 1 in svc_process(). It sill be turned off in
nfsd4_proc_compound() only when NFSv4.1 Sessions are used.

Note: this isn't an adequate solution on its own. It's acceptable as a way
to get some minimal 4.1 up and working, but we're going to have to find a
way to avoid returning DELAY in all common cases before 4.1 can really be
considered ready.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[nfsd41: reverse rq_nodeferral negative logic]
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
[sunrpc: initialize rq_usedeferral]
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 net/sunrpc/svc.c      | 2 ++
 net/sunrpc/svc_xprt.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index fff09a2d8960..45984cbe1bfa 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1023,6 +1023,8 @@ svc_process(struct svc_rqst *rqstp)
 	rqstp->rq_res.tail[0].iov_len = 0;
 	/* Will be turned off only in gss privacy case: */
 	rqstp->rq_splice_ok = 1;
+	/* Will be turned off only when NFSv4 Sessions are used */
+	rqstp->rq_usedeferral = 1;
 
 	/* Setup reply header */
 	rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 1e66f2491460..600d0918e3ae 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -974,7 +974,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req)
 	struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
 	struct svc_deferred_req *dr;
 
-	if (rqstp->rq_arg.page_len)
+	if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral)
 		return NULL; /* if more than a page, give up FIXME */
 	if (rqstp->rq_deferred) {
 		dr = rqstp->rq_deferred;
-- 
cgit v1.2.3