From 4b70ac5fd9b58bfaa5f25b4ea48f528aefbf3308 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2014 19:02:48 +0200 Subject: aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock() On 04/30, Benjamin LaHaise wrote: > > > - ctx->mmap_size = 0; > > - > > - kill_ioctx(mm, ctx, NULL); > > + if (ctx) { > > + ctx->mmap_size = 0; > > + kill_ioctx(mm, ctx, NULL); > > + } > > Rather than indenting and moving the two lines changing mmap_size and the > kill_ioctx() call, why not just do "if (!ctx) ... continue;"? That reduces > the number of lines changed and avoid excessive indentation. OK. To me the code looks better/simpler with "if (ctx)", but this is subjective of course, I won't argue. The patch still removes the empty line between mmap_size = 0 and kill_ioctx(), we reset mmap_size only for kill_ioctx(). But feel free to remove this change. ------------------------------------------------------------------------------- Subject: [PATCH v3 1/2] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock() 1. We can read ->ioctx_table only once and we do not read rcu_read_lock() or even rcu_dereference(). This mm has no users, nobody else can play with ->ioctx_table. Otherwise the code is buggy anyway, if we need rcu_read_lock() in a loop because ->ioctx_table can be updated then kfree(table) is obviously wrong. 2. Update the comment. "exit_mmap(mm) is coming" is the good reason to avoid munmap(), but another reason is that we simply can't do vm_munmap() unless current->mm == mm and this is not true in general, the caller is mmput(). 3. We do not really need to nullify mm->ioctx_table before return, probably the current code does this to catch the potential problems. But in this case RCU_INIT_POINTER(NULL) looks better. Signed-off-by: Oleg Nesterov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 955947ef3e02..b6696462e345 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -791,40 +791,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { - struct kioctx_table *table; - struct kioctx *ctx; - unsigned i = 0; - - while (1) { - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - - do { - if (!table || i >= table->nr) { - rcu_read_unlock(); - rcu_assign_pointer(mm->ioctx_table, NULL); - if (table) - kfree(table); - return; - } + struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); + int i; - ctx = table->table[i++]; - } while (!ctx); + if (!table) + return; - rcu_read_unlock(); + for (i = 0; i < table->nr; ++i) { + struct kioctx *ctx = table->table[i]; + if (!ctx) + continue; /* - * We don't need to bother with munmap() here - - * exit_mmap(mm) is coming and it'll unmap everything. - * Since aio_free_ring() uses non-zero ->mmap_size - * as indicator that it needs to unmap the area, - * just set it to 0; aio_free_ring() is the only - * place that uses ->mmap_size, so it's safe. + * We don't need to bother with munmap() here - exit_mmap(mm) + * is coming and it'll unmap everything. And we simply can't, + * this is not necessarily our ->mm. + * Since kill_ioctx() uses non-zero ->mmap_size as indicator + * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, NULL); } + + RCU_INIT_POINTER(mm->ioctx_table, NULL); + kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) -- cgit v1.2.3 From 855ef0dec7271ff7be7381feaaf3f4aed80bd503 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2014 16:16:36 +0200 Subject: aio: kill the misleading rcu read locks in ioctx_add_table() and kill_ioctx() ioctx_add_table() is the writer, it does not need rcu_read_lock() to protect ->ioctx_table. It relies on mm->ioctx_lock and rcu locks just add the confusion. And it doesn't need rcu_dereference() by the same reason, it must see any updates previously done under the same ->ioctx_lock. We could use rcu_dereference_protected() but the patch uses rcu_dereference_raw(), the function is simple enough. The same for kill_ioctx(), although it does not update the pointer. Signed-off-by: Oleg Nesterov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index b6696462e345..c1d8c480c138 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -554,8 +554,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); + table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) @@ -563,7 +562,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) if (!table->table[i]) { ctx->id = i; table->table[i] = ctx; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -577,8 +575,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } new_nr = (table ? table->nr : 1) * 4; - - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * @@ -589,8 +585,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - old = rcu_dereference(mm->ioctx_table); + old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -737,12 +732,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - + table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* percpu_ref_kill() will do the necessary call_rcu() */ -- cgit v1.2.3 From be6fb451a24582732c66e28cb0beb3f19c4289fd Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 22 Jul 2014 09:56:56 -0400 Subject: aio: remove no longer needed preempt_disable() Based on feedback from Jens Axboe on 263782c1c95bbddbb022dc092fd89a36bb8d5577, clean up get/put_reqs_available() to remove the no longer needed preempt_disable() and preempt_enable() pair. Signed-off-by: Benjamin LaHaise Cc: Jens Axboe --- fs/aio.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 8216aa0c7539..9ce9e8ea773f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -814,10 +814,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) struct kioctx_cpu *kcpu; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { @@ -826,7 +824,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) } local_irq_restore(flags); - preempt_enable(); } static bool get_reqs_available(struct kioctx *ctx) @@ -835,10 +832,8 @@ static bool get_reqs_available(struct kioctx *ctx) bool ret = false; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -858,7 +853,6 @@ static bool get_reqs_available(struct kioctx *ctx) kcpu->reqs_available--; out: local_irq_restore(flags); - preempt_enable(); return ret; } -- cgit v1.2.3 From b53f1f82fbde8dcf34ab7d731c2a9ae6f0d8d2e2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:51 +0800 Subject: aio: remove the needless registration of ring file's private_data Remove the registration of ring file's private_data, we do not use it. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 9ce9e8ea773f..55e03a858e8b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) } file->f_flags = O_RDWR; - file->private_data = ctx; return file; } -- cgit v1.2.3 From 8dc4379e17cddad7b2088a3f300ded50d2a6d493 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:52 +0800 Subject: aio: use the macro rather than the inline magic number Replace the inline magic number with the ready-made macro(AIO_RING_MAGIC), just clean up. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 55e03a858e8b..6fc6b9857a58 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -201,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1); + return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); } /* aio_setup -- cgit v1.2.3 From 2be4e7deec2d4398a0eb2165cc04086ebfc831d2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:53 +0800 Subject: aio: fix some comments The function comments of aio_run_iocb and aio_read_events are out of date, so fix them here. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 6fc6b9857a58..d6d9520f6866 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1020,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } EXPORT_SYMBOL(aio_complete); -/* aio_read_events +/* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ @@ -1272,9 +1272,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, } /* - * aio_setup_iocb: - * Performs the initial checks and aio retry method - * setup for the kiocb at the time of io submission. + * aio_run_iocb: + * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, char __user *buf, bool compat) -- cgit v1.2.3 From 00fefb9cf2b5493a86912de55ba912bdfae4a207 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:54 +0800 Subject: aio: use iovec array rather than the single one Previously, we only offer a single iovec to handle all the read/write cases, so the PREADV/PWRITEV request always need to alloc more iovec buffer when copying user vectors. If we use a tmp iovec array rather than the single one, some small PREADV/PWRITEV workloads(vector size small than the tmp buffer) will not need to alloc more iovec buffer when copying user vectors. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index d6d9520f6866..0fd9181d8c0c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1243,12 +1243,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (compat) ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, (struct iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); if (ret < 0) return ret; @@ -1285,7 +1285,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, fmode_t mode; aio_rw_op *rw_op; rw_iter_op *iter_op; - struct iovec inline_vec, *iovec = &inline_vec; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; switch (opcode) { @@ -1320,7 +1320,7 @@ rw_common: if (!ret) ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); return ret; } @@ -1367,7 +1367,7 @@ rw_common: return -EINVAL; } - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); if (ret != -EIOCBQUEUED) { -- cgit v1.2.3