From 9ad9e8d6ca29c1446d81c6518ae634a2141dfd22 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 29 Oct 2019 16:42:27 +0200 Subject: nvme-rdma: fix a segmentation fault during module unload In case there are controllers that are not associated with any RDMA device (e.g. during unsuccessful reconnection) and the user will unload the module, these controllers will not be freed and will access already freed memory. The same logic appears in other fabric drivers as well. Fixes: 87fd125344d6 ("nvme-rdma: remove redundant reference between ib_device and tagset") Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f19a28b4e997..cb4c3000a57e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2133,8 +2133,16 @@ err_unreg_client: static void __exit nvme_rdma_cleanup_module(void) { + struct nvme_rdma_ctrl *ctrl; + nvmf_unregister_transport(&nvme_rdma_transport); ib_unregister_client(&nvme_rdma_ib_client); + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) + nvme_delete_ctrl(&ctrl->ctrl); + mutex_unlock(&nvme_rdma_ctrl_mutex); + flush_workqueue(nvme_delete_wq); } module_init(nvme_rdma_init_module); -- cgit v1.2.3 From 763303a83a095a88c3a8a0d1abf97165db2e8bf5 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Fri, 1 Nov 2019 17:27:55 -0700 Subject: nvme-multipath: fix crash in nvme_mpath_clear_ctrl_paths nvme_mpath_clear_ctrl_paths() iterates through the ctrl->namespaces list while holding ctrl->scan_lock. This does not seem to be the correct way of protecting from concurrent list modification. Specifically, nvme_scan_work() sorts ctrl->namespaces AFTER unlocking scan_lock. This may result in the following (rare) crash in ctrl disconnect during scan_work: BUG: kernel NULL pointer dereference, address: 0000000000000050 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3995 Comm: nvme 5.3.5-050305-generic RIP: 0010:nvme_mpath_clear_current_path+0xe/0x90 [nvme_core] ... Call Trace: nvme_mpath_clear_ctrl_paths+0x3c/0x70 [nvme_core] nvme_remove_namespaces+0x35/0xe0 [nvme_core] nvme_do_delete_ctrl+0x47/0x90 [nvme_core] nvme_sysfs_delete+0x49/0x60 [nvme_core] dev_attr_store+0x17/0x30 sysfs_kf_write+0x3e/0x50 kernfs_fop_write+0x11e/0x1a0 __vfs_write+0x1b/0x40 vfs_write+0xb9/0x1a0 ksys_write+0x67/0xe0 __x64_sys_write+0x1a/0x20 do_syscall_64+0x5a/0x130 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f8d02bfb154 Fix: After taking scan_lock in nvme_mpath_clear_ctrl_paths() down_read(&ctrl->namespaces_rwsem) as well to make list traversal safe. This will not cause deadlocks because taking scan_lock never happens while holding the namespaces_rwsem. Moreover, scan work downs namespaces_rwsem in the same order. Alternative: sort ctrl->namespaces in nvme_scan_work() while still holding the scan_lock. This would leave nvme_mpath_clear_ctrl_paths() without correct protection against ctrl->namespaces modification by anyone other than scan_work. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Anton Eidelman Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc99a40c1ec4..e0f064dcbd02 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -158,9 +158,11 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) struct nvme_ns *ns; mutex_lock(&ctrl->scan_lock); + down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) if (nvme_mpath_clear_current_path(ns)) kblockd_schedule_work(&ns->head->requeue_work); + up_read(&ctrl->namespaces_rwsem); mutex_unlock(&ctrl->scan_lock); } -- cgit v1.2.3 From 0d6eeb1fd625272bd60d25f2d5e116cf582fc7dc Mon Sep 17 00:00:00 2001 From: Charles Machalow Date: Mon, 4 Nov 2019 22:15:10 -0800 Subject: nvme: change nvme_passthru_cmd64 to explicitly mark rsvd Changing nvme_passthru_cmd64 to add a field: rsvd2. This field is an explicit marker for the padding space added on certain platforms as a result of the enlargement of the result field from 32 bit to 64 bits in size, and fixes differences in struct size when using compat ioctl for 32-bit binaries on 64-bit architecture. Fixes: 65e68edce0db ("nvme: allow 64-bit results in passthru commands") Reviewed-by: Christoph Hellwig Signed-off-by: Charles Machalow [changelog] Signed-off-by: Keith Busch --- include/uapi/linux/nvme_ioctl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index e168dc59e9a0..d99b5a772698 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -63,6 +63,7 @@ struct nvme_passthru_cmd64 { __u32 cdw14; __u32 cdw15; __u32 timeout_ms; + __u32 rsvd2; __u64 result; }; -- cgit v1.2.3 From b0814361a25cba73a224548843ed92d8ea78715a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Nov 2019 08:09:51 -0800 Subject: blkcg: make blkcg_print_stat() print stats only for online blkgs blkcg_print_stat() iterates blkgs under RCU and doesn't test whether the blkg is online. This can call into pd_stat_fn() on a pd which is still being initialized leading to an oops. The heaviest operation - recursively summing up rwstat counters - is already done while holding the queue_lock. Expand queue_lock to cover the other operations and skip the blkg if it isn't online yet. The online state is protected by both blkcg and queue locks, so this guarantees that only online blkgs are processed. Signed-off-by: Tejun Heo Reported-by: Roman Gushchin Cc: Josef Bacik Fixes: 903d23f0a354 ("blk-cgroup: allow controllers to output their own stats") Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5d21027b1faf..1eb8895be4c6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -934,9 +934,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) int i; bool has_stats = false; + spin_lock_irq(&blkg->q->queue_lock); + + if (!blkg->online) + goto skip; + dname = blkg_dev_name(blkg); if (!dname) - continue; + goto skip; /* * Hooray string manipulation, count is the size written NOT @@ -946,8 +951,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - spin_lock_irq(&blkg->q->queue_lock); - blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_bytes), &rwstat); rbytes = rwstat.cnt[BLKG_RWSTAT_READ]; @@ -960,8 +963,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) wios = rwstat.cnt[BLKG_RWSTAT_WRITE]; dios = rwstat.cnt[BLKG_RWSTAT_DISCARD]; - spin_unlock_irq(&blkg->q->queue_lock); - if (rbytes || wbytes || rios || wios) { has_stats = true; off += scnprintf(buf+off, size-off, @@ -999,6 +1000,8 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) seq_commit(sf, -1); } } + skip: + spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); -- cgit v1.2.3 From 8e9c523016cf9983b295e4bc659183d1fa6ef8e0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 7 Nov 2019 10:48:47 +0300 Subject: block: drbd: remove a stray unlock in __drbd_send_protocol() There are two callers of this function and they both unlock the mutex so this ends up being a double unlock. Fixes: 44ed167da748 ("drbd: rcu_read_lock() and rcu_dereference() for tconn->net_conf") Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 5b248763a672..a18155cdce41 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -786,7 +786,6 @@ int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cm if (nc->tentative && connection->agreed_pro_version < 92) { rcu_read_unlock(); - mutex_unlock(&sock->mutex); drbd_err(connection, "--dry-run is not supported by peer"); return -EOPNOTSUPP; } -- cgit v1.2.3 From 65de03e251382306a4575b1779c57c87889eee49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 Nov 2019 12:18:29 -0800 Subject: cgroup,writeback: don't switch wbs immediately on dead wbs if the memcg is dead cgroup writeback tries to refresh the associated wb immediately if the current wb is dead. This is to avoid keeping issuing IOs on the stale wb after memcg - blkcg association has changed (ie. when blkcg got disabled / enabled higher up in the hierarchy). Unfortunately, the logic gets triggered spuriously on inodes which are associated with dead cgroups. When the logic is triggered on dead cgroups, the attempt fails only after doing quite a bit of work allocating and initializing a new wb. While c3aab9a0bd91 ("mm/filemap.c: don't initiate writeback if mapping has no dirty pages") alleviated the issue significantly as it now only triggers when the inode has dirty pages. However, the condition can still be triggered before the inode is switched to a different cgroup and the logic simply doesn't make sense. Skip the immediate switching if the associated memcg is dying. This is a simplified version of the following two patches: * https://lore.kernel.org/linux-mm/20190513183053.GA73423@dennisz-mbp/ * http://lkml.kernel.org/r/156355839560.2063.5265687291430814589.stgit@buzz Cc: Konstantin Khlebnikov Fixes: e8a7abf5a5bd ("writeback: disassociate inodes from dying bdi_writebacks") Acked-by: Dennis Zhou Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8461a6322039..335607b8c5c0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -576,10 +576,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, spin_unlock(&inode->i_lock); /* - * A dying wb indicates that the memcg-blkcg mapping has changed - * and a new wb is already serving the memcg. Switch immediately. + * A dying wb indicates that either the blkcg associated with the + * memcg changed or the associated memcg is dying. In the first + * case, a replacement wb should already be available and we should + * refresh the wb immediately. In the second case, trying to + * refresh will keep failing. */ - if (unlikely(wb_dying(wbc->wb))) + if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); -- cgit v1.2.3