From 05caf8dbc1880415df3378cfd114d832c9618b60 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin"
Date: Thu, 22 May 2008 15:13:29 +0200
Subject: block: Move the second call to get_request to the end of the loop

In function get_request_wait, the second call to get_request can be moved
to the end of the while loop, because if the first call to get_request
fails, the second call will also fail without having slept in between.

Signed-off-by: Zhang Yanmin
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6a9cc0d22a6..1905aaba49f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -806,35 +806,32 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
         rq = get_request(q, rw_flags, bio, GFP_NOIO);
         while (!rq) {
                 DEFINE_WAIT(wait);
+                struct io_context *ioc;
                 struct request_list *rl = &q->rq;
 
                 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
                                 TASK_UNINTERRUPTIBLE);
 
-                rq = get_request(q, rw_flags, bio, GFP_NOIO);
-
-                if (!rq) {
-                        struct io_context *ioc;
+                blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
 
-                        blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
-
-                        __generic_unplug_device(q);
-                        spin_unlock_irq(q->queue_lock);
-                        io_schedule();
+                __generic_unplug_device(q);
+                spin_unlock_irq(q->queue_lock);
+                io_schedule();
 
-                        /*
-                         * After sleeping, we become a "batching" process and
-                         * will be able to allocate at least one request, and
-                         * up to a big batch of them for a small period time.
-                         * See ioc_batching, ioc_set_batching
-                         */
-                        ioc = current_io_context(GFP_NOIO, q->node);
-                        ioc_set_batching(q, ioc);
+                /*
+                 * After sleeping, we become a "batching" process and
+                 * will be able to allocate at least one request, and
+                 * up to a big batch of them for a small period time.
+                 * See ioc_batching, ioc_set_batching
+                 */
+                ioc = current_io_context(GFP_NOIO, q->node);
+                ioc_set_batching(q, ioc);
 
-                        spin_lock_irq(q->queue_lock);
-                }
+                spin_lock_irq(q->queue_lock);
                 finish_wait(&rl->wait[rw], &wait);
-        }
+
+                rq = get_request(q, rw_flags, bio, GFP_NOIO);
+        };
 
         return rq;
 }
-- cgit v1.2.3
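For readability, the loop that results from the hunk above is shown below without diff markers. This is a sketch reconstructed from the patch; the 'rw' derivation and the exact surrounding declarations are assumed from context rather than taken from the hunk.

/*
 * Sketch of get_request_wait() after this change (reconstructed from the
 * hunk above; error paths and details outside the hunk are abbreviated).
 */
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
                                        struct bio *bio)
{
        const int rw = rw_flags & 0x01;         /* assumed from context */
        struct request *rq;

        rq = get_request(q, rw_flags, bio, GFP_NOIO);
        while (!rq) {
                DEFINE_WAIT(wait);
                struct io_context *ioc;
                struct request_list *rl = &q->rq;

                prepare_to_wait_exclusive(&rl->wait[rw], &wait,
                                          TASK_UNINTERRUPTIBLE);

                blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);

                /* drop the queue lock and sleep until a request is freed */
                __generic_unplug_device(q);
                spin_unlock_irq(q->queue_lock);
                io_schedule();

                /* woken up: become a "batching" process, then retry below */
                ioc = current_io_context(GFP_NOIO, q->node);
                ioc_set_batching(q, ioc);

                spin_lock_irq(q->queue_lock);
                finish_wait(&rl->wait[rw], &wait);

                rq = get_request(q, rw_flags, bio, GFP_NOIO);
        }

        return rq;
}

The single allocation attempt now sits at the bottom of the loop, so the first pass never sleeps needlessly and every retry only happens after an actual wait.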
From be754d2c2161c0cce11d62727016985ecb76831b Mon Sep 17 00:00:00 2001
From: Richard Kennedy
Date: Fri, 23 May 2008 06:52:00 +0200
Subject: block: reorder cfq_queue to save space on 64bit builds

saves 8 bytes of padding & increases objects/slab from 30 to 32 on my
AMD64 config

Signed-off-by: Richard Kennedy
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index b399c62936e..4df3f052243 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -124,6 +124,8 @@ struct cfq_data {
 struct cfq_queue {
         /* reference count */
         atomic_t ref;
+        /* various state flags, see below */
+        unsigned int flags;
         /* parent cfq_data */
         struct cfq_data *cfqd;
         /* service_tree member */
@@ -138,14 +140,14 @@ struct cfq_queue {
         int queued[2];
         /* currently allocated requests */
         int allocated[2];
-        /* pending metadata requests */
-        int meta_pending;
         /* fifo list of requests in sort_list */
         struct list_head fifo;
 
         unsigned long slice_end;
         long slice_resid;
+        /* pending metadata requests */
+        int meta_pending;
 
         /* number of requests that are on the dispatch list or inside driver */
         int dispatched;
@@ -153,8 +155,6 @@ struct cfq_queue {
         unsigned short ioprio, org_ioprio;
         unsigned short ioprio_class, org_ioprio_class;
 
-        /* various state flags, see below */
-        unsigned int flags;
 };
 
 enum cfqq_state_flags {
-- cgit v1.2.3
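The saving comes from alignment rules alone: on an LP64 build, 8-byte members must start on 8-byte boundaries, so a 4-byte field sandwiched between pointers or longs costs 4 bytes of padding. A generic illustration of the effect (these structs are illustrative only, not the cfq_queue layout itself):

/* 24 bytes on a typical LP64 target: two 4-byte holes. */
struct interleaved {
        int a;          /* offset 0, followed by 4 bytes of padding */
        void *p;        /* offset 8 */
        int b;          /* offset 16, followed by 4 bytes of tail padding */
};

/* 16 bytes: grouping the ints removes both holes. */
struct grouped {
        void *p;        /* offset 0 */
        int a;          /* offset 8 */
        int b;          /* offset 12, no padding needed */
};

Moving 'flags' next to 'ref' and 'meta_pending' next to 'dispatched' pairs up 4-byte members in the same way, which is consistent with the 8 bytes per object saved above.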
From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle"
Date: Tue, 27 May 2008 14:54:41 +0200
Subject: Added in MESSAGE notes for blktraces

Allows messages to be inserted into blktrace streams.

Signed-off-by: Alan D. Brunelle
Signed-off-by: Jens Axboe
---
 block/blktrace.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'block')

diff --git a/block/blktrace.c b/block/blktrace.c
index b2cbb4e5d76..20e11f354f1 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt)
         local_irq_restore(flags);
 }
 
+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+        int n;
+        va_list args;
+        static char bt_msg_buf[BLK_TN_MAX_MSG];
+
+        va_start(args, fmt);
+        n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args);
+        va_end(args);
+
+        trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n);
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
                          pid_t pid)
 {
-- cgit v1.2.3

From 4722dc52a891ab6cb2d637ddb87233e0ce277827 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle"
Date: Tue, 27 May 2008 14:55:00 +0200
Subject: Added in elevator switch message to blktrace stream

Signed-off-by: Alan D. Brunelle
Signed-off-by: Jens Axboe
---
 block/elevator.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/elevator.c b/block/elevator.c
index 980f8ae147b..902dd1344d5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1110,6 +1110,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
         queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
         spin_unlock_irq(q->queue_lock);
 
+        blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
+
         return 1;
 
 fail_register:
-- cgit v1.2.3

From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 28 May 2008 14:45:33 +0200
Subject: block: make blktrace use per-cpu buffers for message notes

Currently it uses a single static char array, which risks being corrupted
when multiple users issue message notes at the same time. Make the buffers
dynamically allocated when the trace is set up, and make them per-cpu
instead. The default maximum message size of 1k is also very large; the
interface is mainly meant for small text notes, so shrink it to 128 bytes.

Signed-off-by: Jens Axboe
---
 block/blktrace.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blktrace.c b/block/blktrace.c
index 20e11f354f1..7ae87cc4a16 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 {
         int n;
         va_list args;
-        static char bt_msg_buf[BLK_TN_MAX_MSG];
+        char *buf;
 
+        preempt_disable();
+        buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
         va_start(args, fmt);
-        n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args);
+        n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
         va_end(args);
 
-        trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n);
+        trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(__trace_note_message);
 
@@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt)
         debugfs_remove(bt->dropped_file);
         blk_remove_tree(bt->dir);
         free_percpu(bt->sequence);
+        free_percpu(bt->msg_data);
         kfree(bt);
 }
 
@@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
         if (!bt->sequence)
                 goto err;
 
+        bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+        if (!bt->msg_data)
+                goto err;
+
         ret = -ENOENT;
         dir = blk_create_tree(buts->name);
         if (!dir)
@@ -406,6 +414,7 @@ err:
         if (bt->dropped_file)
                 debugfs_remove(bt->dropped_file);
         free_percpu(bt->sequence);
+        free_percpu(bt->msg_data);
         if (bt->rchan)
                 relay_close(bt->rchan);
         kfree(bt);
-- cgit v1.2.3
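The message-note interface added in the patches above is normally reached through the blk_add_trace_msg() wrapper, which only hands the note to __trace_note_message() when a trace is actually attached to the queue. A hypothetical call site (the function name and message text below are invented for illustration, not part of these patches) would look like:

/*
 * Hypothetical example: drop a short, printf-style annotation into the
 * blktrace stream of a queue. Does nothing if no trace is running.
 */
static void note_requeue(struct request_queue *q, int nr_requeued)
{
        blk_add_trace_msg(q, "requeued %d requests", nr_requeued);
}

The note is emitted as a BLK_TN_MESSAGE event and shows up interleaved with the regular I/O trace records.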
From d6de8be711b28049a5cb93c954722c311c7d3f7f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 28 May 2008 14:46:59 +0200
Subject: cfq-iosched: fix RCU problem in cfq_cic_lookup()

cfq_cic_lookup() needs to properly protect ioc->ioc_data before
dereferencing it, and must also exclude updaters of ioc->ioc_data. Also
add a number of comments documenting why the existing RCU usage is OK.

Thanks a lot to "Paul E. McKenney" for review and comments!

Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 4df3f052243..d01b411c72f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1142,6 +1142,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
         kmem_cache_free(cfq_pool, cfqq);
 }
 
+/*
+ * Must always be called with the rcu_read_lock() held
+ */
 static void
 __call_for_each_cic(struct io_context *ioc,
                     void (*func)(struct io_context *, struct cfq_io_context *))
@@ -1197,6 +1200,11 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
         cfq_cic_free(cic);
 }
 
+/*
+ * Must be called with rcu_read_lock() held or preemption otherwise disabled.
+ * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
+ * and ->trim() which is called with the task lock held
+ */
 static void cfq_free_io_context(struct io_context *ioc)
 {
         /*
@@ -1502,20 +1510,24 @@ static struct cfq_io_context *
 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
 {
         struct cfq_io_context *cic;
+        unsigned long flags;
         void *k;
 
         if (unlikely(!ioc))
                 return NULL;
 
+        rcu_read_lock();
+
         /*
          * we maintain a last-hit cache, to avoid browsing over the tree
          */
         cic = rcu_dereference(ioc->ioc_data);
-        if (cic && cic->key == cfqd)
+        if (cic && cic->key == cfqd) {
+                rcu_read_unlock();
                 return cic;
+        }
 
         do {
-                rcu_read_lock();
                 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
                 rcu_read_unlock();
                 if (!cic)
@@ -1524,10 +1536,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
                 k = cic->key;
                 if (unlikely(!k)) {
                         cfq_drop_dead_cic(cfqd, ioc, cic);
+                        rcu_read_lock();
                         continue;
                 }
 
+                spin_lock_irqsave(&ioc->lock, flags);
                 rcu_assign_pointer(ioc->ioc_data, cic);
+                spin_unlock_irqrestore(&ioc->lock, flags);
                 break;
         } while (1);
 
@@ -2134,6 +2149,10 @@ static void *cfq_init_queue(struct request_queue *q)
 
 static void cfq_slab_kill(void)
 {
+        /*
+         * Caller already ensured that pending RCU callbacks are completed,
+         * so we should have no busy allocations at this point.
+         */
         if (cfq_pool)
                 kmem_cache_destroy(cfq_pool);
         if (cfq_ioc_pool)
@@ -2292,6 +2311,11 @@ static void __exit cfq_exit(void)
         ioc_gone = &all_gone;
         /* ioc_gone's update must be visible before reading ioc_count */
         smp_wmb();
+
+        /*
+         * this also protects us from entering cfq_slab_kill() with
+         * pending RCU callbacks
+         */
         if (elv_ioc_count_read(ioc_count))
                 wait_for_completion(ioc_gone);
         cfq_slab_kill();
-- cgit v1.2.3
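The rules the fix enforces are the standard RCU ones: readers hold rcu_read_lock() across the rcu_dereference() and every use of the pointed-to object, while updaters publish a new value with rcu_assign_pointer() under a lock that excludes other updaters. A generic sketch of that pattern (illustrative names only, not the cfq code itself):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct item {
        int key;
};

static struct item *cached;             /* shared, RCU-protected pointer */
static DEFINE_SPINLOCK(update_lock);    /* serializes updaters */

static bool lookup_hits(int key)
{
        struct item *it;
        bool hit;

        rcu_read_lock();                /* keeps *it valid while we use it */
        it = rcu_dereference(cached);
        hit = it && it->key == key;
        rcu_read_unlock();

        return hit;
}

static void publish(struct item *newit)
{
        spin_lock(&update_lock);        /* excludes concurrent updaters */
        rcu_assign_pointer(cached, newit);
        spin_unlock(&update_lock);
}

In cfq_cic_lookup() the reader side is the rcu_dereference() of ioc->ioc_data, and ioc->lock around rcu_assign_pointer() plays the role of the updater lock.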
From d5791d13b1d45542895104edf4b09476d5ad24b0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 9 Jun 2008 10:06:24 -0700
Subject: Fix invalid access errors in blk_lookup_devt

Commit 30f2f0eb4bd2c43d10a8b0d872c6e5ad8f31c9a0 ("block: do_mounts - accept
root=") extended blk_lookup_devt() to be able to look up partitions that
had not yet been registered, but in the process made the assumption that
the '&block_class.devices' list only contains disk devices and that you can
do 'dev_to_disk(dev)' on them.

That isn't actually true. The block_class device list also contains the
partitions we've discovered so far, and you can't just do a 'dev_to_disk()'
on those.

So make sure to only work on devices that block/genhd.c has registered
itself, something we can test by checking the 'dev->type' member. This
makes the loop in blk_lookup_devt() match the other such loops in this
file.

[ We may want to do an alternate version that knows to handle _either_
  whole-disk devices or partitions, but for now this is the minimal fix for
  a series of crashes reported by Mariusz Kozlowski in
  http://lkml.org/lkml/2008/5/25/25 and Ingo in
  http://lkml.org/lkml/2008/6/9/39 ]

Reported-by: Mariusz Kozlowski
Reported-by: Ingo Molnar
Cc: Neil Brown
Cc: Joao Luis Meloni Assirati
Acked-by: Kay Sievers
Cc: Greg Kroah-Hartman
Signed-off-by: Linus Torvalds
---
 block/genhd.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 129ad939f9d..b922d4801c8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -660,6 +660,8 @@ dev_t blk_lookup_devt(const char *name, int part)
 
         mutex_lock(&block_class_lock);
         list_for_each_entry(dev, &block_class.devices, node) {
+                if (dev->type != &disk_type)
+                        continue;
                 if (strcmp(dev->bus_id, name) == 0) {
                         struct gendisk *disk = dev_to_disk(dev);
-- cgit v1.2.3

From 14a73f54798f39854e521fb596da7d50b7566bbd Mon Sep 17 00:00:00 2001
From: Carl Henrik Lunde
Date: Thu, 12 Jun 2008 20:13:58 +0200
Subject: block: disable IRQs until data is written to relay channel

As we may run relay_reserve from interrupt context we must always disable
IRQs. This is because a call to relay_reserve may expose previously written
data to user space.

Updated the new message code and an old but related comment.

Signed-off-by: Carl Henrik Lunde
Signed-off-by: Jens Axboe
Signed-off-by: Linus Torvalds
---
 block/blktrace.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blktrace.c b/block/blktrace.c
index 7ae87cc4a16..8d3a2778026 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -79,16 +79,17 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 {
         int n;
         va_list args;
+        unsigned long flags;
         char *buf;
 
-        preempt_disable();
+        local_irq_save(flags);
         buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
         va_start(args, fmt);
         n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
         va_end(args);
 
         trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
-        preempt_enable();
+        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(__trace_note_message);
 
@@ -158,10 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
         /*
          * A word about the locking here - we disable interrupts to reserve
          * some space in the relay per-cpu buffer, to prevent an irq
-         * from coming in and stepping on our toes. Once reserved, it's
-         * enough to get preemption disabled to prevent read of this data
-         * before we are through filling it. get_cpu()/put_cpu() does this
-         * for us
+         * from coming in and stepping on our toes.
         */
         local_irq_save(flags);
-- cgit v1.2.3
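The discipline this last patch enforces can be summed up in a small pattern: any reservation and fill of relay buffer space must complete with local interrupts disabled, so that an interrupt on the same CPU cannot reserve past, and thereby expose to user space, a record that is still being filled in. A hypothetical helper following that rule (the function below is illustrative only, not part of the patch):

/*
 * Hypothetical helper, not in the patch: copy a raw blob into the relay
 * channel with local interrupts off, mirroring what __blk_add_trace and
 * __trace_note_message do after this change.
 */
static void relay_copy_blob(struct blk_trace *bt, const void *data, size_t len)
{
        unsigned long flags;
        void *buf;

        local_irq_save(flags);
        buf = relay_reserve(bt->rchan, len);
        if (buf)
                memcpy(buf, data, len); /* finish the record before IRQs return */
        local_irq_restore(flags);
}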