From a237c1c5bc5dc5c76a21be922dca4826f3eca8ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:27:55 +0200 Subject: block: let io_schedule() flush the plug inline Linus correctly observes that the most important dispatch cases are now done from kblockd, this isn't ideal for latency reasons. The original reason for switching dispatches out-of-line was to avoid too deep a stack, so by _only_ letting the "accidental" flush directly in schedule() be guarded by offload to kblockd, we should be able to get the best of both worlds. So add a blk_schedule_flush_plug() that offloads to kblockd, and only use that from the schedule() path. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++++ kernel/sched.c | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1c76506fcf1..ec0357d8c4a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -871,6 +871,14 @@ static inline void blk_flush_plug(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; + if (plug) + blk_flush_plug_list(plug, false); +} + +static inline void blk_schedule_flush_plug(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + if (plug) blk_flush_plug_list(plug, true); } @@ -1317,6 +1325,11 @@ static inline void blk_flush_plug(struct task_struct *task) { } +static inline void blk_schedule_flush_plug(struct task_struct *task) +{ +} + + static inline bool blk_needs_flush_plug(struct task_struct *tsk) { return false; diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027..312f8b95c2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4118,7 +4118,7 @@ need_resched: */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); + blk_schedule_flush_plug(prev); raw_spin_lock(&rq->lock); } } -- cgit v1.2.3 From 49cac01e1fa74174d72adb0e872504a7fefd7c01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:51:05 +0200 Subject: block: make unplug timer trace event correspond to the schedule() unplug It's a pretty close match to what we had before - the timer triggering would mean that nobody unplugged the plug in due time, in the new scheme this matches very closely what the schedule() unplug now is. It's essentially the difference between an explicit unplug (IO unplug) or an implicit unplug (timer unplug, we scheduled with pending IO queued). Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ++++++++++++------ include/trace/events/block.h | 13 +++++++------ kernel/trace/blktrace.c | 18 ++++++++++++------ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 3c812107250..78b7b0cb721 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2662,17 +2662,23 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) return !(rqa->q <= rqb->q); } +/* + * If 'from_schedule' is true, then postpone the dispatch of requests + * until a safe kblockd context. We due this to avoid accidental big + * additional stack usage in driver dispatch, in places where the originally + * plugger did not intend it. + */ static void queue_unplugged(struct request_queue *q, unsigned int depth, - bool force_kblockd) + bool from_schedule) { - trace_block_unplug_io(q, depth); - __blk_run_queue(q, force_kblockd); + trace_block_unplug(q, depth, !from_schedule); + __blk_run_queue(q, from_schedule); if (q->unplugged_fn) q->unplugged_fn(q); } -void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) +void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; unsigned long flags; @@ -2707,7 +2713,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) BUG_ON(!rq->q); if (rq->q != q) { if (q) { - queue_unplugged(q, depth, force_kblockd); + queue_unplugged(q, depth, from_schedule); spin_unlock(q->queue_lock); } q = rq->q; @@ -2728,7 +2734,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) } if (q) { - queue_unplugged(q, depth, force_kblockd); + queue_unplugged(q, depth, from_schedule); spin_unlock(q->queue_lock); } diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 006e60b5830..bf366547da2 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -401,9 +401,9 @@ TRACE_EVENT(block_plug, DECLARE_EVENT_CLASS(block_unplug, - TP_PROTO(struct request_queue *q, unsigned int depth), + TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), - TP_ARGS(q, depth), + TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) @@ -419,18 +419,19 @@ DECLARE_EVENT_CLASS(block_unplug, ); /** - * block_unplug_io - release of operations requests in request queue + * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue + * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ -DEFINE_EVENT(block_unplug, block_unplug_io, +DEFINE_EVENT(block_unplug, block_unplug, - TP_PROTO(struct request_queue *q, unsigned int depth), + TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), - TP_ARGS(q, depth) + TP_ARGS(q, depth, explicit) ); /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3e3970d53d1..6957aa298df 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -850,16 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q, - unsigned int depth) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) { struct blk_trace *bt = q->blk_trace; if (bt) { __be64 rpdu = cpu_to_be64(depth); + u32 what; - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; + + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -1002,7 +1007,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); @@ -1017,7 +1022,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); @@ -1332,6 +1337,7 @@ static const struct { [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, -- cgit v1.2.3