diff options
Diffstat (limited to 'fs')
308 files changed, 5127 insertions, 2297 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index a7e28890f5ef..c30c6ceac2c4 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -282,32 +282,28 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - umode_t mode = inode->i_mode; - retval = posix_acl_equiv_mode(acl, &mode); - if (retval < 0) + struct iattr iattr; + struct posix_acl *old_acl = acl; + + retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + if (retval) goto err_out; - else { - struct iattr iattr; - if (retval == 0) { - /* - * ACL can be represented - * by the mode bits. So don't - * update ACL. - */ - acl = NULL; - value = NULL; - size = 0; - } - /* Updte the mode bits */ - iattr.ia_mode = ((mode & S_IALLUGO) | - (inode->i_mode & ~S_IALLUGO)); - iattr.ia_valid = ATTR_MODE; - /* FIXME should we update ctime ? - * What is the following setxattr update the - * mode ? + if (!acl) { + /* + * ACL can be represented + * by the mode bits. So don't + * update ACL. */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + posix_acl_release(old_acl); + value = NULL; + size = 0; } + iattr.ia_valid = ATTR_MODE; + /* FIXME should we update ctime ? + * What is the following setxattr update the + * mode ? + */ + v9fs_vfs_setattr_dotl(dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..12ceaf52dae6 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { - fid = v9fs_fid_clone(file->f_path.dentry); + fid = v9fs_fid_clone(file_dentry(file)); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(file->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(file)); if (IS_ERR(fid)) { err = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); @@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) * because we want write after unlink usecase * to work. */ - fid = v9fs_writeback_fid(filp->f_path.dentry); + fid = v9fs_writeback_fid(file_dentry(filp)); if (IS_ERR(fid)) { retval = PTR_ERR(fid); mutex_unlock(&v9inode->v_mutex); diff --git a/fs/affs/super.c b/fs/affs/super.c index 5b50c4ca43a7..f90c535703ce 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) char *prefix = NULL; new_opts = kstrdup(data, GFP_KERNEL); - if (!new_opts) + if (data && !new_opts) return -ENOMEM; pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); @@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data) } flush_delayed_work(&sbi->sb_work); - replace_mount_options(sb, new_opts); + if (new_opts) + replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; sbi->s_mode = mode; @@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops, + AIO_RING_MAGIC); + + if (!IS_ERR(root)) + root->d_sb->s_iflags |= SB_I_NOEXEC; + return root; } /* aio_setup diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..d62f674a605f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -202,6 +202,21 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(inode)) { + error = inode_permission(inode, MAY_WRITE); + if (error) + return error; + } + } + if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c37149b929be..502d3892d8a4 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -79,9 +79,13 @@ struct autofs_info { }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ -#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered +#define AUTOFS_INF_WANT_EXPIRE (1<<1) /* the dentry is being considered * for expiry, so RCU_walk is - * not permitted + * not permitted. If it progresses to + * actual expiry attempt, the flag is + * not cleared when EXPIRING is set - + * in that case it gets cleared only + * when it comes to clearing EXPIRING. */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index ac7d921ed984..257425511d10 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -331,7 +331,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, int status; token = (autofs_wqt_t) param->fail.token; - status = param->fail.status ? param->fail.status : -ENOENT; + status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 1cebc3c52fa5..7a5a598a2d94 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -315,19 +315,17 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, if (ino->flags & AUTOFS_INF_PENDING) goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { - ino->flags |= AUTOFS_INF_NO_RCU; + ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { ino->flags |= AUTOFS_INF_EXPIRING; - smp_mb(); - ino->flags &= ~AUTOFS_INF_NO_RCU; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } - ino->flags &= ~AUTOFS_INF_NO_RCU; + ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; } out: spin_unlock(&sbi->fs_lock); @@ -417,6 +415,7 @@ static struct dentry *should_expire(struct dentry *dentry, } return NULL; } + /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -432,6 +431,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, struct dentry *root = sb->s_root; struct dentry *dentry; struct dentry *expired; + struct dentry *found; struct autofs_info *ino; if (!root) @@ -442,48 +442,54 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_NO_RCU) - expired = NULL; - else - expired = should_expire(dentry, mnt, timeout, how); - if (!expired) { + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(expired); - ino->flags |= AUTOFS_INF_NO_RCU; + ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); - if (should_expire(expired, mnt, timeout, how)) { - if (expired != dentry) - dput(dentry); - goto found; - } - ino->flags &= ~AUTOFS_INF_NO_RCU; + /* Make sure a reference is not taken on found if + * things have changed. + */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + if (expired != dentry) - dput(expired); + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); + if (expired != dentry) + dput(expired); } return NULL; found: DPRINTK("returning %p %pd", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; - smp_mb(); - ino->flags &= ~AUTOFS_INF_NO_RCU; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&sbi->lookup_lock); - spin_lock(&expired->d_parent->d_lock); - spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&expired->d_parent->d_subdirs, &expired->d_child); - spin_unlock(&expired->d_lock); - spin_unlock(&expired->d_parent->d_lock); - spin_unlock(&sbi->lookup_lock); return expired; } @@ -492,15 +498,27 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; + int state; /* Block on any pending expire */ - if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))) + if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) return 0; if (rcu_walk) return -ECHILD; +retry: spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_EXPIRING) { + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. + */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); DPRINTK("waiting for expire %p name=%pd", dentry, dentry); @@ -551,7 +569,7 @@ int autofs4_expire_run(struct super_block *sb, ino = autofs4_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; - ino->flags &= ~AUTOFS_INF_EXPIRING; + ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -579,7 +597,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; - ino->flags &= ~AUTOFS_INF_EXPIRING; + ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); dput(dentry); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index c6d7d3dbd52a..7a54c6a867c8 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -455,7 +455,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * a mount-trap. */ struct inode *inode; - if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; if (d_mountpoint(dentry)) return 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3a93755e880f..f44e93d2650d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -905,17 +905,60 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; vaddr = elf_ppnt->p_vaddr; + /* + * If we are loading ET_EXEC or we have already performed + * the ET_DYN load_addr calculations, proceed normally. + */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { elf_flags |= MAP_FIXED; } else if (loc->elf_ex.e_type == ET_DYN) { - /* Try and get dynamic programs out of the way of the - * default mmap base, as well as whatever program they - * might try to exec. This is because the brk will - * follow the loader, and is not movable. */ - load_bias = ELF_ET_DYN_BASE - vaddr; - if (current->flags & PF_RANDOMIZE) - load_bias += arch_mmap_rnd(); - load_bias = ELF_PAGESTART(load_bias); + /* + * This logic is run once for the first LOAD Program + * Header for ET_DYN binaries to calculate the + * randomization (load_bias) for all the LOAD + * Program Headers, and to calculate the entire + * size of the ELF mapping (total_size). (Note that + * load_addr_set is set to true later once the + * initial mapping is performed.) + * + * There are effectively two types of ET_DYN + * binaries: programs (i.e. PIE: ET_DYN with INTERP) + * and loaders (ET_DYN without INTERP, since they + * _are_ the ELF interpreter). The loaders must + * be loaded away from programs since the program + * may otherwise collide with the loader (especially + * for ET_EXEC which does not have a randomized + * position). For example to handle invocations of + * "./ld.so someprog" to test out a new version of + * the loader, the subsequent program that the + * loader loads must avoid the loader itself, so + * they cannot share the same load range. Sufficient + * room for the brk must be allocated with the + * loader as well, since brk must be available with + * the loader. + * + * Therefore, programs are loaded offset from + * ELF_ET_DYN_BASE and loaders are loaded into the + * independently randomized mmap region (0 load_bias + * without MAP_FIXED). + */ + if (elf_interpreter) { + load_bias = ELF_ET_DYN_BASE; + if (current->flags & PF_RANDOMIZE) + load_bias += arch_mmap_rnd(); + elf_flags |= MAP_FIXED; + } else + load_bias = 0; + + /* + * Since load_bias is used for all subsequent loading + * calculations, we must lower it by the first vaddr + * so that the remaining calculations based on the + * ELF vaddrs will be correctly offset. The result + * is then page aligned. + */ + load_bias = ELF_PAGESTART(load_bias - vaddr); + total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum); if (!total_size) { @@ -2295,6 +2338,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } } + dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/block_dev.c b/fs/block_dev.c index 44d4a1e9244e..26bbaaefdff4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -88,12 +88,11 @@ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) - return; - - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); + if (mapping->nrpages) { + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + } /* 99% of the time, we don't need to flush the cleancache on the bdev. * But, for the strange corners, lets be cautious */ @@ -759,7 +758,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ - else if (bdev->bd_contains == bdev) + else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) @@ -1098,7 +1097,6 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; @@ -1806,6 +1804,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || @@ -1826,8 +1825,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) */ iput(old_inode); old_inode = inode; + bdev = I_BDEV(inode); - func(I_BDEV(inode), arg); + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers) + func(bdev, arg); + mutex_unlock(&bdev->bd_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a95851..fb3e64d37cb4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -83,11 +83,9 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (ret) return ret; - if (ret == 0) - acl = NULL; } ret = 0; break; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 9aba42b78253..a09264d8b853 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -70,6 +70,20 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq) +{ + /* + * We could compare wq->normal->pending with num_online_cpus() + * to support "thresh == NO_THRESHOLD" case, but it requires + * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's + * postpone it until someone needs the support of that case. + */ + if (wq->normal->thresh == NO_THRESHOLD) + return false; + + return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ad4d0647d1a6..8e1d6576d764 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -80,4 +80,5 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c473c42d7d6c..bae05c5c75ba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -694,7 +694,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } @@ -723,7 +723,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b8e235c4b6d..0f2b7c622ce3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1551,6 +1551,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { + trans->dirty = true; *cow_ret = buf; return 0; } @@ -2773,8 +2774,10 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (!should_cow_block(trans, root, b)) { + trans->dirty = true; goto cow_done; + } /* * must have write locks on this node and the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 385b449fd7ed..e847573c6db0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1770,6 +1770,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; + bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; @@ -3069,6 +3070,8 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static inline void @@ -3087,6 +3090,8 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); } /* struct btrfs_super_block */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 02b934d0ee65..09fa5af9782e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1375,7 +1375,8 @@ release_path: total_done++; btrfs_release_prepared_delayed_node(delayed_node); - if (async_work->nr == 0 || total_done < async_work->nr) + if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || + total_done < async_work->nr) goto again; free_path: @@ -1391,7 +1392,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return 0; async_work = kmalloc(sizeof(*async_work), GFP_NOFS); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41fb43183406..85b207d19aa5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2276,6 +2276,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -3811,7 +3812,7 @@ void close_ctree(struct btrfs_root *root) smp_mb(); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2368cac1115a..c36a03fa7678 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2520,11 +2520,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - btrfs_delayed_ref_unlock(locked_ref); spin_lock(&delayed_refs->lock); locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; cond_resched(); count++; @@ -2570,7 +2570,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + spin_lock(&delayed_refs->lock); locked_ref->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); btrfs_delayed_ref_unlock(locked_ref); return ret; @@ -3851,6 +3854,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, info->space_info_kobj, "%s", alloc_name(found->flags)); if (ret) { + percpu_counter_destroy(&found->total_bytes_pinned); kfree(found); return ret; } @@ -7856,7 +7860,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -8486,14 +8490,13 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); - if (ret < 0) { - btrfs_tree_unlock(next); - return ret; - } + if (ret < 0) + goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { btrfs_err(root->fs_info, "Missing references."); - BUG(); + ret = -EIO; + goto out_unlock; } *lookup_info = 0; @@ -8545,7 +8548,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } level--; - BUG_ON(level != btrfs_header_level(next)); + ASSERT(level == btrfs_header_level(next)); + if (level != btrfs_header_level(next)) { + btrfs_err(root->fs_info, "mismatched level"); + ret = -EIO; + goto out_unlock; + } path->nodes[level] = next; path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; @@ -8560,8 +8568,15 @@ skip: if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { parent = path->nodes[level]->start; } else { - BUG_ON(root->root_key.objectid != + ASSERT(root->root_key.objectid == btrfs_header_owner(path->nodes[level])); + if (root->root_key.objectid != + btrfs_header_owner(path->nodes[level])) { + btrfs_err(root->fs_info, + "mismatched block owner"); + ret = -EIO; + goto out_unlock; + } parent = 0; } @@ -8578,12 +8593,18 @@ skip: } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_unlock; } + + *lookup_info = 1; + ret = 1; + +out_unlock: btrfs_tree_unlock(next); free_extent_buffer(next); - *lookup_info = 1; - return 1; + + return ret; } /* @@ -9686,6 +9707,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) struct extent_buffer *leaf; int need_clear = 0; u64 cache_gen; + u64 feature; + int mixed; + + feature = btrfs_super_incompat_flags(info->super_copy); + mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); root = info->extent_root; key.objectid = 0; @@ -9739,6 +9765,15 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); cache->flags = btrfs_block_group_flags(&cache->item); + if (!mixed && + ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && + (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(info, +"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", + cache->key.objectid); + ret = -EINVAL; + goto error; + } key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9abe18763a7f..e767f347f2b1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2786,12 +2786,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } @@ -5300,11 +5294,20 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, lock_page(page); } locked_pages++; + } + /* + * We need to firstly lock all pages to make sure that + * the uptodate bit of our pages won't be affected by + * clear_extent_buffer_uptodate(). + */ + for (i = start_i; i < num_pages; i++) { + page = eb->pages[i]; if (!PageUptodate(page)) { num_reads++; all_uptodate = 0; } } + if (all_uptodate) { if (start_i == 0) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f09526aa7d9..d4a6eef31854 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1526,27 +1526,24 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) { - ret = check_can_nocow(inode, pos, &write_bytes); - if (ret < 0) - break; - if (ret > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_CACHE_SIZE); - reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - goto reserve_metadata; - } + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); + reserve_bytes = num_pages << PAGE_CACHE_SHIFT; + goto reserve_metadata; } + ret = btrfs_check_data_free_space(inode, pos, write_bytes); if (ret < 0) break; @@ -1885,7 +1882,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -2774,7 +2771,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (!ret) ret = btrfs_prealloc_file_range(inode, mode, range->start, - range->len, 1 << inode->i_blkbits, + range->len, i_blocksize(inode), offset + len, &alloc_hint); list_del(&range->list); kfree(range); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4bc9dbf29a73..bebd6517355d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4397,8 +4397,19 @@ search_again: if (found_type > min_type) { del_item = 1; } else { - if (item_end < new_size) + if (item_end < new_size) { + /* + * With NO_HOLES mode, for the following mapping + * + * [0-4k][hole][8k-12k] + * + * if truncating isize down to 6k, it ends up + * isize being 8k. + */ + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + last_size = new_size; break; + } if (found_key.offset >= new_size) del_item = 1; else @@ -7318,8 +7329,8 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) int found = false; void **pagep = NULL; struct page *page = NULL; - int start_idx; - int end_idx; + unsigned long start_idx; + unsigned long end_idx; start_idx = start >> PAGE_CACHE_SHIFT; @@ -7510,11 +7521,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode, * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (dio_data->outstanding_extents) { + if (dio_data->outstanding_extents >= num_extents) { dio_data->outstanding_extents -= num_extents; } else { + /* + * If dio write length has been split due to no large enough + * contiguous space, we need to compensate our inode counter + * appropriately. + */ + u64 num_needed = num_extents - dio_data->outstanding_extents; + spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_extents; + BTRFS_I(inode)->outstanding_extents += num_needed; spin_unlock(&BTRFS_I(inode)->lock); } } @@ -8691,9 +8709,14 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * So even we call qgroup_free_data(), it won't decrease reserved * space. * 2) Not written to disk - * This means the reserved space should be freed here. + * This means the reserved space should be freed here. However, + * if a truncate invalidates the page (by clearing PageDirty) + * and the page is accounted for while allocating extent + * in btrfs_check_data_free_space() we let delayed_ref to + * free the entire extent. */ - btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); + if (PageDirty(page)) + btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f07d01bc4875..317b99acdf4b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1619,6 +1619,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, int namelen; int ret = 0; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1648,7 +1651,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - btrfs_info(BTRFS_I(src_inode)->root->fs_info, + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { @@ -1676,6 +1679,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1699,6 +1705,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -2345,6 +2354,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; + if (!S_ISDIR(dir->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -3813,6 +3825,11 @@ process_slot: } btrfs_release_path(path); key.offset = next_key_min_offset; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } } ret = 0; @@ -5121,7 +5138,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info); + return btrfs_qgroup_wait_for_completion(root->fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5279fdae7142..88d9b66e2207 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, goto out; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -2349,6 +2349,9 @@ out: } done: + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = false; + mutex_unlock(&fs_info->qgroup_rescan_lock); complete_all(&fs_info->qgroup_rescan_completion); } @@ -2390,6 +2393,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); + fs_info->qgroup_rescan_running = true; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2467,20 +2471,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + running = fs_info->qgroup_rescan_running; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - if (running) + if (!running) + return 0; + + if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..3d73e4c9c7df 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b4ca5454ef1a..8ca9aa92972d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -921,9 +921,16 @@ again: path2->slots[level]--; eb = path2->nodes[level]; - WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != - cur->bytenr); - + if (btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr) { + btrfs_err(root->fs_info, + "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", + cur->bytenr, level - 1, root->objectid, + node_key->objectid, node_key->type, + node_key->offset); + err = -ENOENT; + goto out; + } lower = cur; need_check = true; for (; level < BTRFS_MAX_LEVEL; level++) { @@ -2343,6 +2350,10 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->node = NULL; + reloc_root->commit_root = NULL; __del_reloc_root(reloc_root); } } @@ -2676,11 +2687,15 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - err = ret; + if (ret) { + if (ret < 0) + err = ret; + else + err = -ENOENT; + + btrfs_release_path(path); break; } - BUG_ON(ret > 0); if (!upper->eb) { upper->eb = path->nodes[upper->level]; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fe609b81dd1b..3bd2233737ac 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -239,7 +239,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used && list_empty(&trans->new_bgs)) { + if (!trans->dirty && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); @@ -1727,6 +1727,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_qgroup_rescan_resume(fs_info); + if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 64c8221b6165..1e872923ec2c 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,7 +110,6 @@ struct btrfs_trans_handle { u64 chunk_bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; - unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; @@ -121,6 +120,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool sync; + bool dirty; unsigned int type; /* * this root is only needed to validate that the root passed to diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 323e12cc9d2f..ee7832e2d39d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1923,12 +1923,11 @@ static noinline int find_dir_range(struct btrfs_root *root, next: /* check the next slot in the tree to see if it is a valid item */ nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); if (ret) goto out; - } else { - path->slots[0]++; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -2696,14 +2695,12 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, int index, int error) { struct btrfs_log_ctx *ctx; + struct btrfs_log_ctx *safe; - if (!error) { - INIT_LIST_HEAD(&root->log_ctxs[index]); - return; - } - - list_for_each_entry(ctx, &root->log_ctxs[index], list) + list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { + list_del_init(&ctx->list); ctx->log_ret = error; + } INIT_LIST_HEAD(&root->log_ctxs[index]); } @@ -2850,6 +2847,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -2943,13 +2941,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: - /* - * We needn't get log_mutex here because we are sure all - * the other tasks are blocked. - */ + mutex_lock(&log_root_tree->log_mutex); btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); - mutex_lock(&log_root_tree->log_mutex); log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); @@ -2960,10 +2954,8 @@ out_wake_log_root: if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: - /* See above. */ - btrfs_remove_all_log_ctxs(root, index1, ret); - mutex_lock(&root->log_mutex); + btrfs_remove_all_log_ctxs(root, index1, ret); root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); @@ -4406,6 +4398,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, return ret; } +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * <power fail> + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * <power fail> + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * <power fail> + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. + */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct inode *inode) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size_nr(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, + search_path, parent, + name, this_name_len, 0); + if (di && !IS_ERR(di)) { + ret = 1; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4578,6 +4691,22 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + if ((min_key.type == BTRFS_INODE_REF_KEY || + min_key.type == BTRFS_INODE_EXTREF_KEY) && + BTRFS_I(inode)->generation == trans->transid) { + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], + &min_key, inode); + if (ret < 0) { + err = ret; + goto out_unlock; + } else if (ret > 0) { + err = 1; + btrfs_set_log_full_commit(root->fs_info, trans); + goto out_unlock; + } + } + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (ins_nr == 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c62a6f9757a..600c67ef8a03 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, diff --git a/fs/buffer.c b/fs/buffer.c index 4f4cd959da7c..6f7d519a093b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2298,7 +2298,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; + unsigned int blocksize = i_blocksize(inode); struct page *page; void *fsdata; pgoff_t index, curidx; @@ -2378,8 +2378,8 @@ int cont_write_begin(struct file *file, struct address_space *mapping, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; - unsigned blocksize = 1 << inode->i_blkbits; - unsigned zerofrom; + unsigned int blocksize = i_blocksize(inode); + unsigned int zerofrom; int err; err = cont_expand_zero(file, mapping, pos, bytes); @@ -2741,7 +2741,7 @@ int nobh_truncate_page(struct address_space *mapping, struct buffer_head map_bh; int err; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ @@ -2819,7 +2819,7 @@ int block_truncate_page(struct address_space *mapping, struct buffer_head *bh; int err; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); length = offset & (blocksize - 1); /* Block boundary? Nothing to do */ @@ -2931,7 +2931,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block, struct inode *inode = mapping->host; tmp.b_state = 0; tmp.b_blocknr = 0; - tmp.b_size = 1 << inode->i_blkbits; + tmp.b_size = i_blocksize(inode); get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8f84646f10e9..bdb9c94335f1 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -94,11 +94,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - ret = posix_acl_equiv_mode(acl, &new_mode); - if (ret < 0) + ret = posix_acl_update_mode(inode, &new_mode, &acl); + if (ret) goto out; - if (ret == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: @@ -130,7 +128,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (new_mode != old_mode) { newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; - ret = ceph_setattr(dentry, &newattrs); + ret = __ceph_setattr(dentry, &newattrs); if (ret) goto out_dput; } @@ -140,7 +138,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (new_mode != old_mode) { newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE; - ceph_setattr(dentry, &newattrs); + __ceph_setattr(dentry, &newattrs); } goto out_dput; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..22bae2b434e2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,7 +189,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) /* * read a single page, without unlocking it. */ -static int readpage_nounlock(struct file *filp, struct page *page) +static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); @@ -219,7 +219,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_readpage_from_fscache(inode, page); if (err == 0) - goto out; + return -EINPROGRESS; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); @@ -249,8 +249,11 @@ out: static int ceph_readpage(struct file *filp, struct page *page) { - int r = readpage_nounlock(filp, page); - unlock_page(page); + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; return r; } @@ -697,7 +700,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct pagevec pvec; int done = 0; int rc = 0; - unsigned wsize = 1 << inode->i_blkbits; + unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; int do_sync = 0; loff_t snap_size, i_size; @@ -1094,7 +1097,7 @@ retry_locked: goto retry_locked; r = writepage_nounlock(page, NULL); if (r < 0) - goto fail_nosnap; + goto fail_unlock; goto retry_locked; } @@ -1122,11 +1125,14 @@ retry_locked: } /* we need to read it. */ - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; + r = ceph_do_readpage(file, page); + if (r < 0) { + if (r == -EINPROGRESS) + return -EAGAIN; + goto fail_unlock; + } goto retry_locked; -fail_nosnap: +fail_unlock: unlock_page(page); return r; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..ff1cfd7b1083 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -224,13 +224,7 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } -static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); -} - -static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) { if (!error) SetPageUptodate(page); @@ -259,7 +253,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page) return -ENOBUFS; ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_vfs_readpage_complete, NULL, + ceph_readpage_from_fscache_complete, NULL, GFP_KERNEL); switch (ret) { @@ -288,7 +282,7 @@ int ceph_readpages_from_fscache(struct inode *inode, return -ENOBUFS; ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_vfs_readpage_complete_unlock, + ceph_readpage_from_fscache_complete, NULL, mapping_gfp_mask(mapping)); switch (ret) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2375..be7d187d53fd 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -247,6 +247,11 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, if (ret < 0) err = ret; dput(last); + /* last_name no longer match cache index */ + if (fi->readdir_cache_idx >= 0) { + fi->readdir_cache_idx = -1; + fi->dir_release_count = 0; + } } return err; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee2f0..c8222bfe1e56 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -929,7 +929,8 @@ again: statret = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, !!page); if (statret < 0) { - __free_page(page); + if (page) + __free_page(page); if (statret == -ENODATA) { BUG_ON(retry_op != READ_INLINE); goto again; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 498dcfa2dcdb..9f0d99094cc1 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1358,15 +1358,20 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, if (!ctl->page || pgoff != page_index(ctl->page)) { ceph_readdir_cache_release(ctl); - ctl->page = grab_cache_page(&dir->i_data, pgoff); + if (idx == 0) + ctl->page = grab_cache_page(&dir->i_data, pgoff); + else + ctl->page = find_lock_page(&dir->i_data, pgoff); if (!ctl->page) { ctl->index = -1; - return -ENOMEM; + return idx == 0 ? -ENOMEM : 0; } /* reading/filling the cache are serialized by * i_mutex, no need to use page lock */ unlock_page(ctl->page); ctl->dentries = kmap(ctl->page); + if (idx == 0) + memset(ctl->dentries, 0, PAGE_CACHE_SIZE); } if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && @@ -1768,7 +1773,7 @@ static const struct inode_operations ceph_symlink_iops = { /* * setattr */ -int ceph_setattr(struct dentry *dentry, struct iattr *attr) +int __ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); @@ -1970,11 +1975,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, attr->ia_mode); - if (err) - goto out_put; - } if (mask) { req->r_inode = inode; @@ -1988,13 +1988,23 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - if (mask & CEPH_SETATTR_SIZE) - __ceph_do_pending_vmtruncate(inode); ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + return err; -out_put: - ceph_mdsc_put_request(req); - ceph_free_cap_flush(prealloc_cf); +} + +int ceph_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + + err = __ceph_setattr(dentry, attr); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(d_inode(dentry), attr->ia_mode); + return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e7b130a637f9..f54f77037d22 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -274,12 +274,13 @@ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + u32 op = le32_to_cpu(info->head->op); + + if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR || - info->head->op == CEPH_MDS_OP_LSSNAP) + else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_CREATE) + else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else return -EIO; @@ -643,6 +644,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + /* Never leave an unregistered request on an unsafe list! */ + list_del_init(&req->r_unsafe_item); + if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; @@ -1050,7 +1054,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - list_del_init(&req->r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); __unregister_request(mdsc, req); @@ -2476,7 +2479,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * useful we could do with a revised return value. */ dout("got safe reply %llu, mds%d\n", tid, mds); - list_del_init(&req->r_unsafe_item); /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 75b7d125ce66..8c8cb8fe3d32 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -788,6 +788,7 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) return __ceph_do_getattr(inode, NULL, mask, force); } extern int ceph_permission(struct inode *inode, int mask); +extern int __ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 819163d8313b..b24275ef97f7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -369,6 +369,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (update_xattr) { int err = 0; + if (xattr && (flags & XATTR_CREATE)) err = -EEXIST; else if (!xattr && (flags & XATTR_REPLACE)) @@ -376,12 +377,14 @@ static int __set_xattr(struct ceph_inode_info *ci, if (err) { kfree(name); kfree(val); + kfree(*newxattr); return err; } if (update_xattr < 0) { if (xattr) __remove_xattr(ci, xattr); kfree(name); + kfree(*newxattr); return 0; } } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 50b268483302..0a3544fb50f9 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -152,6 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, @@ -255,7 +256,6 @@ static const struct file_operations cifs_debug_data_proc_fops = { static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; bool bv; int rc; struct list_head *tmp1, *tmp2, *tmp3; @@ -263,11 +263,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, struct cifs_ses *ses; struct cifs_tcon *tcon; - rc = get_user(c, buffer); - if (rc) - return rc; - - if (strtobool(&c, &bv) == 0) { + rc = kstrtobool_from_user(buffer, count, &bv); + if (rc == 0) { #ifdef CONFIG_CIFS_STATS2 atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); @@ -290,6 +287,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, } } spin_unlock(&cifs_tcp_ses_lock); + } else { + return rc; } return count; @@ -433,17 +432,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file) static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; + char c[2] = { '\0' }; bool bv; int rc; - rc = get_user(c, buffer); + rc = get_user(c[0], buffer); if (rc) return rc; - if (strtobool(&c, &bv) == 0) + if (strtobool(c, &bv) == 0) cifsFYI = bv; - else if ((c > '1') && (c <= '9')) - cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ + else if ((c[0] > '1') && (c[0] <= '9')) + cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ return count; } @@ -471,20 +470,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) static ssize_t cifs_linux_ext_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; - bool bv; int rc; - rc = get_user(c, buffer); + rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled); if (rc) return rc; - rc = strtobool(&c, &bv); - if (rc) - return rc; - - linuxExtEnabled = bv; - return count; } @@ -511,20 +502,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) static ssize_t cifs_lookup_cache_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; - bool bv; int rc; - rc = get_user(c, buffer); + rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled); if (rc) return rc; - rc = strtobool(&c, &bv); - if (rc) - return rc; - - lookupCacheEnabled = bv; - return count; } @@ -551,20 +534,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file) static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - char c; - bool bv; int rc; - rc = get_user(c, buffer); + rc = kstrtobool_from_user(buffer, count, &traceSMB); if (rc) return rc; - rc = strtobool(&c, &bv); - if (rc) - return rc; - - traceSMB = bv; - return count; } @@ -622,7 +597,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, int rc; unsigned int flags; char flags_string[12]; - char c; bool bv; if ((count < 1) || (count > 11)) @@ -635,11 +609,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (count < 3) { /* single char or single char followed by null */ - c = flags_string[0]; - if (strtobool(&c, &bv) == 0) { + if (strtobool(flags_string, &bv) == 0) { global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; return count; - } else if (!isdigit(c)) { + } else if (!isdigit(flags_string[0])) { cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string); return -EINVAL; diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 66cf0f9fff89..c611ca2339d7 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -25,7 +25,7 @@ void cifs_dump_mem(char *label, void *data, int length); void cifs_dump_detail(void *); void cifs_dump_mids(struct TCP_Server_Info *); -extern int traceSMB; /* flag which enables the function below */ +extern bool traceSMB; /* flag which enables the function below */ void dump_smb(void *, int); #define CIFS_INFO 0x01 #define CIFS_RC 0x02 diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 3182273a3407..1418daa03d95 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -46,6 +46,9 @@ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ +#define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible + * root mountable + */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -67,5 +70,6 @@ struct cifs_sb_info { struct backing_dev_info bdi; struct delayed_work prune_tlinks; struct rcu_head rcu; + char *prepath; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 5a53ac6b1e02..a0b3e7d1be48 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_COLON: *target = ':'; break; + case SFM_DOUBLEQUOTE: + *target = '"'; + break; case SFM_ASTERISK: *target = '*'; break; @@ -101,6 +104,12 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_SLASH: *target = '\\'; break; + case SFM_SPACE: + *target = ' '; + break; + case SFM_PERIOD: + *target = '.'; + break; default: return false; } @@ -404,7 +413,7 @@ static __le16 convert_to_sfu_char(char src_char) return dest_char; } -static __le16 convert_to_sfm_char(char src_char) +static __le16 convert_to_sfm_char(char src_char, bool end_of_string) { __le16 dest_char; @@ -412,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char) case ':': dest_char = cpu_to_le16(SFM_COLON); break; + case '"': + dest_char = cpu_to_le16(SFM_DOUBLEQUOTE); + break; case '*': dest_char = cpu_to_le16(SFM_ASTERISK); break; @@ -427,6 +439,18 @@ static __le16 convert_to_sfm_char(char src_char) case '|': dest_char = cpu_to_le16(SFM_PIPE); break; + case '.': + if (end_of_string) + dest_char = cpu_to_le16(SFM_PERIOD); + else + dest_char = 0; + break; + case ' ': + if (end_of_string) + dest_char = cpu_to_le16(SFM_SPACE); + else + dest_char = 0; + break; default: dest_char = 0; } @@ -469,9 +493,16 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, /* see if we must remap this char */ if (map_chars == SFU_MAP_UNI_RSVD) dst_char = convert_to_sfu_char(src_char); - else if (map_chars == SFM_MAP_UNI_RSVD) - dst_char = convert_to_sfm_char(src_char); - else + else if (map_chars == SFM_MAP_UNI_RSVD) { + bool end_of_string; + + if (i == srclen - 1) + end_of_string = true; + else + end_of_string = false; + + dst_char = convert_to_sfm_char(src_char, end_of_string); + } else dst_char = 0; /* * FIXME: We can not handle remapping backslash (UNI_SLASH) diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index bdc52cb9a676..07ade707fa60 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -57,6 +57,7 @@ * not conflict (although almost does) with the mapping above. */ +#define SFM_DOUBLEQUOTE ((__u16) 0xF020) #define SFM_ASTERISK ((__u16) 0xF021) #define SFM_QUESTION ((__u16) 0xF025) #define SFM_COLON ((__u16) 0xF022) @@ -64,6 +65,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE ((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) +#define SFM_SPACE ((__u16) 0xF028) +#define SFM_PERIOD ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index e682b36a210f..4acbc390a7d6 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -731,24 +731,26 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) memcpy(ses->auth_key.response + baselen, tiblob, tilen); + mutex_lock(&ses->server->srv_mutex); + rc = crypto_hmacmd5_alloc(ses->server); if (rc) { cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* calculate ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* calculate first part of the client response (CR1) */ rc = CalcNTLMv2_response(ses, ntlmv2_hash); if (rc) { cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* now calculate the session key for NTLMv2 */ @@ -757,13 +759,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, @@ -771,7 +773,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, @@ -779,6 +781,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); +unlock: + mutex_unlock(&ses->server->srv_mutex); setup_ntlmv2_rsp_ret: kfree(tiblob); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cbc0f4bca0c0..4f4fc9ff3636 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -54,10 +54,10 @@ #endif int cifsFYI = 0; -int traceSMB = 0; +bool traceSMB; bool enable_oplocks = true; -unsigned int linuxExtEnabled = 1; -unsigned int lookupCacheEnabled = 1; +bool linuxExtEnabled = true; +bool lookupCacheEnabled = true; unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; @@ -268,7 +268,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->createtime = 0; cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(cifs_inode->lease_key); #endif /* * Can not set i_flags here - they get immediately overwritten to zero @@ -686,6 +686,14 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_cifs_sb; } + if (volume_info->prepath) { + cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; + } + } + cifs_setup_cifs_sb(volume_info, cifs_sb); rc = cifs_mount(cifs_sb, volume_info); @@ -724,7 +732,11 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - root = cifs_get_root(volume_info, sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + root = dget(sb->s_root); + else + root = cifs_get_root(volume_info, sb); + if (IS_ERR(root)) goto out_super; @@ -1198,7 +1210,6 @@ init_cifs(void) GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; spin_lock_init(&cifs_tcp_ses_lock); - spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); if (cifs_max_pending < 2) { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2b510c537a0d..e2f6a79e9b01 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -227,6 +227,7 @@ struct smb_version_operations { /* verify the message */ int (*check_message)(char *, unsigned int); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); + int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); /* process transaction2 response */ @@ -627,6 +628,8 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + struct delayed_work reconnect; /* reconnect workqueue job */ + struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ #endif /* CONFIG_CIFS_SMB2 */ }; @@ -826,7 +829,9 @@ cap_unix(struct cifs_ses *ses) struct cifs_tcon { struct list_head tcon_list; int tc_count; + struct list_head rlist; /* reconnect list */ struct list_head openFileList; + spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; @@ -883,7 +888,7 @@ struct cifs_tcon { #endif /* CONFIG_CIFS_STATS2 */ __u64 bytes_read; __u64 bytes_written; - spinlock_t stat_lock; + spinlock_t stat_lock; /* protects the two fields above */ #endif /* CONFIG_CIFS_STATS */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ @@ -902,7 +907,6 @@ struct cifs_tcon { bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ - bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1034,8 +1038,10 @@ struct cifs_fid_locks { }; struct cifsFileInfo { + /* following two lists are protected by tcon->open_file_lock */ struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ + /* lock list below protected by cifsi->lock_sem */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ @@ -1043,11 +1049,12 @@ struct cifsFileInfo { /* BB add lock scope info here if needed */ ; /* lock scope id (0 if none) */ struct dentry *dentry; - unsigned int f_flags; struct tcon_link *tlink; + unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - int count; /* refcount protected by cifs_file_list_lock */ + int count; + spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ @@ -1114,7 +1121,7 @@ struct cifs_writedata { /* * Take a reference on the file private data. Must be called with - * cifs_file_list_lock held. + * cfile->file_info_lock held. */ static inline void cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) @@ -1283,12 +1290,19 @@ struct mid_q_entry { void *callback_data; /* general purpose pointer for callback */ void *resp_buf; /* pointer to received SMB header */ int mid_state; /* wish this were enum but can not pass to wait_event */ + unsigned int mid_flags; __le16 command; /* smb command code */ bool large_buf:1; /* if valid response, is pointer to large buf */ bool multiRsp:1; /* multiple trans2 responses for one request */ bool multiEnd:1; /* both received */ }; +struct close_cancelled_open { + struct cifs_fid fid; + struct cifs_tcon *tcon; + struct work_struct work; +}; + /* Make code in transport.c a little cleaner by moving update of optional stats into function below */ #ifdef CONFIG_CIFS_STATS2 @@ -1420,6 +1434,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param, #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 +/* Flags */ +#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ + /* Types of response buffer returned from SendReceive2 */ #define CIFS_NO_BUFFER 0 /* Response buffer not returned */ #define CIFS_SMALL_BUFFER 1 @@ -1508,8 +1525,10 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * cifs_file_list_lock protects: - * list operations on tcp and SMB session lists and tCon lists + * tcp_ses_lock protects: + * list operations on tcp and SMB session lists + * tcon->open_file_lock protects the list of open files hanging off the tcon + * cfile->file_info_lock protects counters and fields in cifs file struct * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations * @@ -1541,18 +1560,12 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; * tcp session, and the list of tcon's per smb session. It also protects * the reference counters for the server, smb session, and tcon. Finally, * changes to the tcon->tidStatus should be done while holding this lock. + * generally the locks should be taken in order tcp_ses_lock before + * tcon->open_file_lock and that before file->file_info_lock since the + * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -/* - * This lock protects the cifs_file->llist and cifs_file->flist - * list operations, and updates to some flags (cifs_file->invalidHandle) - * It will be moved to either use the tcon->stat_lock or equivalent later. - * If cifs_tcp_ses_lock and the lock below are both needed to be held, then - * the cifs_tcp_ses_lock must be grabbed first and released last. - */ -GLOBAL_EXTERN spinlock_t cifs_file_list_lock; - #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; @@ -1588,11 +1601,11 @@ GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */ -GLOBAL_EXTERN unsigned int lookupCacheEnabled; +GLOBAL_EXTERN bool lookupCacheEnabled; GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ -GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ +GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c63fd1dde25b..54590fd33df1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -205,6 +205,9 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); +extern void cifs_put_tcp_session(struct TCP_Server_Info *server, + int from_reconnect); +extern void cifs_put_tcon(struct cifs_tcon *tcon); #if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) extern void cifs_dfs_release_automount_timer(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 76fcb50295a3..b60150e5b5ce 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -98,13 +98,13 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp1; /* list all files open on tree connection and mark them invalid */ - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); open_file->invalidHandle = true; open_file->oplock_break_cancelled = true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. @@ -717,6 +717,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server) if (rc) return rc; + if (server->capabilities & CAP_UNICODE) + smb->hdr.Flags2 |= SMBFLG2_UNICODE; + /* set up echo request */ smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; @@ -1424,6 +1427,8 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) length = discard_remaining_data(server); dequeue_mid(mid, rdata->result); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; return length; } @@ -1538,6 +1543,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); dequeue_mid(mid, false); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; return length; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3c194ff0d2f0..53a827c6d8b1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -52,6 +52,9 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "fscache.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2proto.h" +#endif #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -409,6 +412,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } } while (server->tcpStatus == CifsNeedReconnect); + if (server->tcpStatus == CifsNeedNegotiate) + mod_delayed_work(cifsiod_wq, &server->echo, 0); + return rc; } @@ -418,16 +424,27 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); + unsigned long echo_interval; + + /* + * If we need to renegotiate, set echo interval to zero to + * immediately call echo service where we can renegotiate. + */ + if (server->tcpStatus == CifsNeedNegotiate) + echo_interval = 0; + else + echo_interval = SMB_ECHO_INTERVAL; /* - * We cannot send an echo if it is disabled or until the - * NEGOTIATE_PROTOCOL request is done, which is indicated by - * server->ops->need_neg() == true. Also, no need to ping if - * we got a response recently. + * We cannot send an echo if it is disabled. + * Also, no need to ping if we got a response recently. */ - if (!server->ops->need_neg || server->ops->need_neg(server) || + + if (server->tcpStatus == CifsNeedReconnect || + server->tcpStatus == CifsExiting || + server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -919,10 +936,19 @@ cifs_demultiplex_thread(void *p) server->lstrp = jiffies; if (mid_entry != NULL) { + if ((mid_entry->mid_flags & MID_WAIT_CANCELLED) && + mid_entry->mid_state == MID_RESPONSE_RECEIVED && + server->ops->handle_cancelled_mid) + server->ops->handle_cancelled_mid( + mid_entry->resp_buf, + server); + if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (!server->ops->is_oplock_break || - !server->ops->is_oplock_break(buf, server)) { + } else if (server->ops->is_oplock_break && + server->ops->is_oplock_break(buf, server)) { + cifs_dbg(FYI, "Received oplock break\n"); + } else { cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, @@ -2111,8 +2137,8 @@ cifs_find_tcp_session(struct smb_vol *vol) return NULL; } -static void -cifs_put_tcp_session(struct TCP_Server_Info *server) +void +cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) { struct task_struct *task; @@ -2129,6 +2155,19 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) cancel_delayed_work_sync(&server->echo); +#ifdef CONFIG_CIFS_SMB2 + if (from_reconnect) + /* + * Avoid deadlock here: reconnect work calls + * cifs_put_tcp_session() at its end. Need to be sure + * that reconnect work does nothing with server pointer after + * that step. + */ + cancel_delayed_work(&server->reconnect); + else + cancel_delayed_work_sync(&server->reconnect); +#endif + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -2193,12 +2232,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info) INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); +#ifdef CONFIG_CIFS_SMB2 + INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); + mutex_init(&tcp_ses->reconnect_mutex); +#endif memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); + generate_random_uuid(tcp_ses->client_guid); #endif /* * at this point we are the only ones with the pointer @@ -2345,7 +2388,7 @@ cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); sesInfoFree(ses); - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); } #ifdef CONFIG_KEYS @@ -2519,7 +2562,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) mutex_unlock(&ses->session_mutex); /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); free_xid(xid); return ses; } @@ -2609,7 +2652,7 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc) return NULL; } -static void +void cifs_put_tcon(struct cifs_tcon *tcon) { unsigned int xid; @@ -3515,6 +3558,44 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +static int +cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, + unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + char *full_path) +{ + int rc; + char *s; + char sep, tmp; + + sep = CIFS_DIR_SEP(cifs_sb); + s = full_path; + + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, ""); + while (rc == 0) { + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + /* next separator */ + while (*s && *s != sep) + s++; + + /* + * temporarily null-terminate the path at the end of + * the current component + */ + tmp = *s; + *s = 0; + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, + full_path); + *s = tmp; + } + return rc; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3652,6 +3733,18 @@ remote_path_check: kfree(full_path); goto mount_fail_check; } + + if (rc != -EREMOTE) { + rc = cifs_are_all_path_components_accessible(server, + xid, tcon, cifs_sb, + full_path); + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } + } kfree(full_path); } @@ -3715,7 +3808,7 @@ mount_fail_check: else if (ses) cifs_put_smb_ses(ses); else - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); bdi_destroy(&cifs_sb->bdi); } @@ -3921,6 +4014,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb) bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); + kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); } @@ -4025,7 +4119,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); if (IS_ERR(ses)) { tcon = (struct cifs_tcon *)ses; - cifs_put_tcp_session(master_tcon->ses->server); + cifs_put_tcp_session(master_tcon->ses->server, 0); goto out; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c3eb998a99bd..297e05c9e2b0 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -84,6 +84,7 @@ build_path_from_dentry(struct dentry *direntry) struct dentry *temp; int namelen; int dfsplen; + int pplen = 0; char *full_path; char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); @@ -95,8 +96,12 @@ build_path_from_dentry(struct dentry *direntry) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else dfsplen = 0; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; + cifs_bp_rename_retry: - namelen = dfsplen; + namelen = dfsplen + pplen; seq = read_seqbegin(&rename_lock); rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { @@ -137,7 +142,7 @@ cifs_bp_rename_retry: } } rcu_read_unlock(); - if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { + if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) { cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", namelen, dfsplen); /* presumably this is only possible if racing with a rename @@ -153,6 +158,17 @@ cifs_bp_rename_retry: those safely to '/' if any are found in the middle of the prepath */ /* BB test paths to Windows with '/' in the midst of prepath */ + if (pplen) { + int i; + + cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); + memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); + full_path[dfsplen] = '\\'; + for (i = 0; i < pplen-1; i++) + if (full_path[dfsplen+1+i] == '/') + full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); + } + if (dfsplen) { strncpy(full_path, tcon->treeName, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { @@ -167,15 +183,20 @@ cifs_bp_rename_retry: } /* + * Don't allow path components longer than the server max. * Don't allow the separator character in a path component. * The VFS will not allow "/", but "\" is allowed by posix. */ static int -check_name(struct dentry *direntry) +check_name(struct dentry *direntry, struct cifs_tcon *tcon) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); int i; + if (unlikely(direntry->d_name.len > + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) + return -ENAMETOOLONG; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { @@ -229,6 +250,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto cifs_create_get_file_info; } + if (S_ISDIR(newinode->i_mode)) { + CIFSSMBClose(xid, tcon, fid->netfid); + iput(newinode); + rc = -EISDIR; + goto out; + } + if (!S_ISREG(newinode->i_mode)) { /* * The server may allow us to open things like @@ -399,10 +427,14 @@ cifs_create_set_dentry: if (rc != 0) { cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", rc); - if (server->ops->close) - server->ops->close(xid, tcon, fid); - goto out; + goto out_err; } + + if (S_ISDIR(newinode->i_mode)) { + rc = -EISDIR; + goto out_err; + } + d_drop(direntry); d_add(direntry, newinode); @@ -410,6 +442,13 @@ out: kfree(buf); kfree(full_path); return rc; + +out_err: + if (server->ops->close) + server->ops->close(xid, tcon, fid); + if (newinode) + iput(newinode); + goto out; } int @@ -455,10 +494,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, return finish_no_open(file, res); } - rc = check_name(direntry); - if (rc) - return rc; - xid = get_xid(); cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", @@ -471,6 +506,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, } tcon = tlink_tcon(tlink); + + rc = check_name(direntry, tcon); + if (rc) + goto out_free_xid; + server = tcon->ses->server; if (server->ops->new_lease_key) @@ -731,7 +771,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); - rc = check_name(direntry); + rc = check_name(direntry, pTcon); if (rc) goto lookup_out; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0068e82217c3..a0c0a49b6620 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -305,6 +305,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + spin_lock_init(&cfile->file_info_lock); cifs_sb_active(inode->i_sb); @@ -317,7 +318,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = 0; } - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); @@ -326,12 +327,13 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) list_add(&cfile->flist, &cinode->openFileList); else list_add_tail(&cfile->flist, &cinode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (fid->purge_cache) cifs_zap_mapping(inode); @@ -343,16 +345,16 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo * cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - spin_lock(&cifs_file_list_lock); + spin_lock(&cifs_file->file_info_lock); cifsFileInfo_get_locked(cifs_file); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); return cifs_file; } /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding - * cifs_file_list_lock. + * tcon->open_file_lock and cifs_file->file_info_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { @@ -367,11 +369,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_pending_open open; bool oplock_break_cancelled; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); + + spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&tcon->open_file_lock); return; } + spin_unlock(&cifs_file->file_info_lock); if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); @@ -395,7 +401,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } - spin_unlock(&cifs_file_list_lock); + + spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); @@ -772,10 +779,10 @@ int cifs_closedir(struct inode *inode, struct file *file) server = tcon->ses->server; cifs_dbg(FYI, "Freeing private data in close dir\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else @@ -784,7 +791,7 @@ int cifs_closedir(struct inode *inode, struct file *file) /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { @@ -1720,12 +1727,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1736,8 +1744,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1745,7 +1753,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } @@ -1754,6 +1762,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; bool any_available = false; int rc; unsigned int refind = 0; @@ -1769,15 +1778,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1788,8 +1798,8 @@ refind_writable: if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } else { if (!inv_file) @@ -1805,24 +1815,24 @@ refind_writable: if (inv_file) { any_available = false; - cifsFileInfo_get_locked(inv_file); + cifsFileInfo_get(inv_file); } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); if (!rc) return inv_file; else { - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_move_tail(&inv_file->flist, &cifs_inode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); cifsFileInfo_put(inv_file); - spin_lock(&cifs_file_list_lock); ++refind; inv_file = NULL; + spin_lock(&tcon->open_file_lock); goto refind_writable; } } @@ -2535,7 +2545,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->credits = credits; if (!wdata->cfile->invalidHandle || - !cifs_reopen_file(wdata->cfile, false)) + !(rc = cifs_reopen_file(wdata->cfile, false))) rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); if (rc) { @@ -2948,7 +2958,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->credits = credits; if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); error: if (rc) { @@ -3534,7 +3544,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); @@ -3632,15 +3642,17 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + struct cifs_tcon *tcon = + cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 1; } } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 0; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba35aa..9cdeb0293267 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -982,10 +982,26 @@ struct inode *cifs_root_iget(struct super_block *sb) struct inode *inode = NULL; long rc; struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + char *path = NULL; + int len; + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + && cifs_sb->prepath) { + len = strlen(cifs_sb->prepath); + path = kzalloc(len + 2 /* leading sep + null */, GFP_KERNEL); + if (path == NULL) + return ERR_PTR(-ENOMEM); + path[0] = '/'; + memcpy(path+1, cifs_sb->prepath, len); + } else { + path = kstrdup("", GFP_KERNEL); + if (path == NULL) + return ERR_PTR(-ENOMEM); + } xid = get_xid(); if (tcon->unix_ext) { - rc = cifs_get_inode_info_unix(&inode, "", sb, xid); + rc = cifs_get_inode_info_unix(&inode, path, sb, xid); /* some servers mistakenly claim POSIX support */ if (rc != -EOPNOTSUPP) goto iget_no_retry; @@ -993,7 +1009,8 @@ struct inode *cifs_root_iget(struct super_block *sb) tcon->unix_ext = false; } - rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); + rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL); iget_no_retry: if (!inode) { @@ -1022,6 +1039,7 @@ iget_no_retry: } out: + kfree(path); /* can not call macro free_xid here since in a void func * TODO: This is no longer true */ diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 35cf990f87d3..a8f5b31636dc 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -272,6 +272,8 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_IOC_GET_MNT_INFO: + if (pSMBFile == NULL) + break; tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8442b8b8e0be..2396ab099849 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -120,6 +120,7 @@ tconInfoAlloc(void) ++ret_buf->tc_count; INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + spin_lock_init(&ret_buf->open_file_lock); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -465,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -495,11 +496,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) &netfile->oplock_break); netfile->oplock_break_cancelled = false; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; @@ -613,9 +614,9 @@ backup_cred(struct cifs_sb_info *cifs_sb) void cifs_del_pending_open(struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(open->tlink)->open_file_lock); list_del(&open->olist); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } void @@ -635,7 +636,7 @@ void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(tlink)->open_file_lock); cifs_add_pending_open_locked(fid, tlink, open); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 848249fa120f..3079b38f0afb 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -133,6 +133,6 @@ typedef struct _AUTHENTICATE_MESSAGE { int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses); void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses); -int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen, +int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, const struct nls_table *nls_cp); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b30a4a6d98a0..97d1a15873c5 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -282,6 +282,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file) rc = -ENOMEM; goto error_exit; } + spin_lock_init(&cifsFile->file_info_lock); file->private_data = cifsFile; cifsFile->tlink = cifs_get_tlink(tlink); tcon = tlink_tcon(tlink); @@ -594,14 +595,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) server->ops->close_dir(xid, tcon, &cfile->fid); } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (cfile->srch_inf.ntwrk_buf_start) { cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 59727e32ed0f..e88ffe1da045 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -364,19 +364,43 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, sec_blob->DomainName.MaximumLength = 0; } -/* We do not malloc the blob, it is passed in pbuffer, because its - maximum possible size is fixed and small, making this approach cleaner. - This function returns the length of the data in the blob */ -int build_ntlmssp_auth_blob(unsigned char *pbuffer, +static int size_of_ntlmssp_blob(struct cifs_ses *ses) +{ + int sz = sizeof(AUTHENTICATE_MESSAGE) + ses->auth_key.len + - CIFS_SESS_KEY_SIZE + CIFS_CPHTXT_SIZE + 2; + + if (ses->domainName) + sz += 2 * strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + else + sz += 2; + + if (ses->user_name) + sz += 2 * strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); + else + sz += 2; + + return sz; +} + +int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc; - AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + AUTHENTICATE_MESSAGE *sec_blob; __u32 flags; unsigned char *tmp; + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); + *buflen = 0; + goto setup_ntlmv2_ret; + } + *pbuffer = kmalloc(size_of_ntlmssp_blob(ses), GFP_KERNEL); + sec_blob = (AUTHENTICATE_MESSAGE *)*pbuffer; + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -391,7 +415,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } - tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->LmChallengeResponse.BufferOffset = @@ -399,23 +423,27 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.Length = 0; sec_blob->LmChallengeResponse.MaximumLength = 0; - sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); - goto setup_ntlmv2_ret; + sec_blob->NtChallengeResponse.BufferOffset = + cpu_to_le32(tmp - *pbuffer); + if (ses->user_name != NULL) { + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + } else { + /* + * don't send an NT Response for anonymous access + */ + sec_blob->NtChallengeResponse.Length = 0; + sec_blob->NtChallengeResponse.MaximumLength = 0; } - memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - - sec_blob->NtChallengeResponse.Length = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); - sec_blob->NtChallengeResponse.MaximumLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); if (ses->domainName == NULL) { - sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->DomainName.Length = 0; sec_blob->DomainName.MaximumLength = 0; tmp += 2; @@ -424,14 +452,14 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ - sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); sec_blob->DomainName.MaximumLength = cpu_to_le16(len); tmp += len; } if (ses->user_name == NULL) { - sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->UserName.Length = 0; sec_blob->UserName.MaximumLength = 0; tmp += 2; @@ -440,13 +468,13 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ - sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); sec_blob->UserName.MaximumLength = cpu_to_le16(len); tmp += len; } - sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->WorkstationName.Length = 0; sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; @@ -455,19 +483,19 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) && !calc_seckey(ses)) { memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); sec_blob->SessionKey.MaximumLength = cpu_to_le16(CIFS_CPHTXT_SIZE); tmp += CIFS_CPHTXT_SIZE; } else { - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = 0; sec_blob->SessionKey.MaximumLength = 0; } + *buflen = tmp - *pbuffer; setup_ntlmv2_ret: - *buflen = tmp - pbuffer; return rc; } @@ -670,20 +698,24 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; - /* no capabilities flags in old lanman negotiation */ - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - - /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negottiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ - rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); - - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; + if (ses->user_name != NULL) { + /* no capabilities flags in old lanman negotiation */ + pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, + ses->server->sec_mode & SECMODE_PW_ENCRYPT ? + true : false, lnm_session_key); + + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + } else { + pSMB->old_req.PasswordLength = 0; + } /* * can not sign if LANMAN negotiated so no need @@ -769,27 +801,32 @@ sess_auth_ntlm(struct sess_data *sess_data) capabilities = cifs_ssetup_hdr(ses, pSMB); pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_AUTH_RESP_SIZE); + if (ses->user_name != NULL) { + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLM authentication\n", + rc); + goto out; + } - /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, sess_data->nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", - rc); - goto out; + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + } else { + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + pSMB->req_no_secext.CaseSensitivePasswordLength = 0; } - /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ if (sess_data->iov[0].iov_len % 2) { @@ -878,22 +915,26 @@ sess_auth_ntlmv2(struct sess_data *sess_data) /* LM2 password would be here if we supported it */ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - /* calculate nlmv2 response and session key */ - rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); - goto out; - } + if (ses->user_name != NULL) { + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc); + goto out; + } - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - ses->auth_key.len - CIFS_SESS_KEY_SIZE); - bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - /* set case sensitive password length after tilen may get - * assigned, tilen is 0 otherwise. - */ - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + /* set case sensitive password length after tilen may get + * assigned, tilen is 0 otherwise. + */ + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); + } else { + pSMB->req_no_secext.CaseSensitivePasswordLength = 0; + } if (ses->capabilities & CAP_UNICODE) { if (sess_data->iov[0].iov_len % 2) { @@ -1245,7 +1286,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; __u16 bytes_remaining; char *bcc_ptr; - char *ntlmsspblob = NULL; + unsigned char *ntlmsspblob = NULL; u16 blob_len; cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); @@ -1258,19 +1299,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* Build security blob before we assemble the request */ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; smb_buf = (struct smb_hdr *)pSMB; - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. - */ - ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto out; - } - - rc = build_ntlmssp_auth_blob(ntlmsspblob, + rc = build_ntlmssp_auth_blob(&ntlmsspblob, &blob_len, ses, sess_data->nls_cp); if (rc) goto out_free_ntlmsspblob; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index fc537c29044e..efd72e1fae74 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid, __u16 search_flags, struct cifs_search_info *srch_inf) { - return CIFSFindFirst(xid, tcon, path, cifs_sb, - &fid->netfid, search_flags, srch_inf, true); + int rc; + + rc = CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); + if (rc) + cifs_dbg(FYI, "find first failed=%d\n", rc); + return rc; } static int @@ -1015,6 +1020,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile) return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; } +static bool +cifs_can_echo(struct TCP_Server_Info *server) +{ + if (server->tcpStatus == CifsGood) + return true; + + return false; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1049,6 +1063,7 @@ struct smb_version_operations smb1_operations = { .get_dfs_refer = CIFSGetDFSRefer, .qfs_tcon = cifs_qfs_tcon, .is_path_accessible = cifs_is_path_accessible, + .can_echo = cifs_can_echo, .query_path_info = cifs_query_path_info, .query_file_info = cifs_query_file_info, .get_srv_inum = cifs_get_srv_inum, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index f9e766f464be..b2aff0c6f22c 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -260,7 +260,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) * and check it for zero before using. */ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < sizeof(struct smb2_lock_element)) { free_xid(xid); return -EINVAL; } diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index bc0bb9c34f72..238759c146ba 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -44,6 +44,7 @@ #define SMB2_OP_DELETE 7 #define SMB2_OP_HARDLINK 8 #define SMB2_OP_SET_EOF 9 +#define SMB2_OP_RMDIR 10 /* Used when constructing chained read requests. */ #define CHAINED_REQUEST 1 @@ -60,4 +61,14 @@ /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 +/* + * Maximum number of credits to keep available. + * This value is chosen somewhat arbitrarily. The Windows client + * defaults to 128 credits, the Windows server allows clients up to + * 512 credits, and the NetApp server does not limit clients at all. + * Choose a high enough value such that the client shouldn't limit + * performance. + */ +#define SMB2_MAX_CREDITS_AVAILABLE 32000 + #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 899bbc86f73e..1238cd3552f9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -80,6 +80,10 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, * SMB2_open() call. */ break; + case SMB2_OP_RMDIR: + tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid, + fid.volatile_fid); + break; case SMB2_OP_RENAME: tmprc = SMB2_rename(xid, tcon, fid.persistent_fid, fid.volatile_fid, (__le16 *)data); @@ -191,8 +195,8 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, - NULL, SMB2_OP_DELETE); + CREATE_NOT_FILE, + NULL, SMB2_OP_RMDIR); } int @@ -262,9 +266,15 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; int rc; + if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && + (buf->LastWriteTime == 0) && (buf->ChangeTime) && + (buf->Attributes == 0)) + return 0; /* would be a no op, no sense sending this */ + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, SMB2_OP_SET_INFO); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1c5907019045..76ccf20fbfb7 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -525,19 +525,19 @@ smb2_is_valid_lease_break(char *buffer) list_for_each(tmp1, &server->smb_ses_list) { ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); - spin_lock(&cifs_file_list_lock); list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); if (smb2_tcon_has_lease(tcon, rsp, lw)) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } + spin_unlock(&tcon->open_file_lock); } - spin_unlock(&cifs_file_list_lock); } } spin_unlock(&cifs_tcp_ses_lock); @@ -579,7 +579,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { cfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -591,7 +591,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(d_inode(cfile->dentry)); - + spin_lock(&cfile->file_info_lock); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; @@ -613,14 +613,14 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) clear_bit( CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - + spin_unlock(&cfile->file_info_lock); queue_work(cifsiod_wq, &cfile->oplock_break); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; @@ -630,3 +630,47 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); return false; } + +void +smb2_cancelled_close_fid(struct work_struct *work) +{ + struct close_cancelled_open *cancelled = container_of(work, + struct close_cancelled_open, work); + + cifs_dbg(VFS, "Close unmatched open\n"); + + SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid, + cancelled->fid.volatile_fid); + cifs_put_tcon(cancelled->tcon); + kfree(cancelled); +} + +int +smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buffer; + struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer; + struct cifs_tcon *tcon; + struct close_cancelled_open *cancelled; + + if (hdr->Command != SMB2_CREATE || hdr->Status != STATUS_SUCCESS) + return 0; + + cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); + if (!cancelled) + return -ENOMEM; + + tcon = smb2_find_smb_tcon(server, hdr->SessionId, hdr->TreeId); + if (!tcon) { + kfree(cancelled); + return -ENOENT; + } + + cancelled->fid.persistent_fid = rsp->PersistentFileId; + cancelled->fid.volatile_fid = rsp->VolatileFileId; + cancelled->tcon = tcon; + INIT_WORK(&cancelled->work, smb2_cancelled_close_fid); + queue_work(cifsiod_wq, &cancelled->work); + + return 0; +} diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6ff18..1d125d3d0d89 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -282,7 +282,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "Link Speed %lld\n", le64_to_cpu(out_buf->LinkSpeed)); } - + kfree(out_buf); return rc; } #endif /* STATS2 */ @@ -536,6 +536,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) server->ops->set_oplock_level(cinode, oplock, fid->epoch, &fid->purge_cache); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); + memcpy(cfile->fid.create_guid, fid->create_guid, 16); } static void @@ -694,6 +695,7 @@ smb2_clone_range(const unsigned int xid, cchunk_out: kfree(pcchunk); + kfree(retbuf); return rc; } @@ -818,7 +820,6 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; - char *retbuf = NULL; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -844,7 +845,7 @@ smb2_duplicate_extents(const unsigned int xid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - (char **)&retbuf, + NULL, &ret_data_len); if (ret_data_len > 0) @@ -867,7 +868,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { struct fsctl_set_integrity_information_req integr_info; - char *retbuf = NULL; unsigned int ret_data_len; integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); @@ -879,7 +879,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, FSCTL_SET_INTEGRITY_INFORMATION, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), - (char **)&retbuf, + NULL, &ret_data_len); } @@ -909,7 +909,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { - cifs_dbg(VFS, "open dir failed\n"); + cifs_dbg(FYI, "open dir failed rc=%d\n", rc); return rc; } @@ -919,7 +919,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); if (rc) { - cifs_dbg(VFS, "query directory failed\n"); + cifs_dbg(FYI, "query directory failed rc=%d\n", rc); SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } return rc; @@ -1036,9 +1036,12 @@ smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) static void smb2_new_lease_key(struct cifs_fid *fid) { - get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(fid->lease_key); } +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + static int smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, char **target_path, @@ -1051,7 +1054,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct smb2_err_rsp *err_buf = NULL; struct smb2_symlink_err_rsp *symlink; - unsigned int sub_len, sub_offset; + unsigned int sub_len; + unsigned int sub_offset; + unsigned int print_len; + unsigned int print_offset; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -1072,11 +1078,33 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, kfree(utf16_path); return -ENOENT; } + + if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || + get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) { + kfree(utf16_path); + return -ENOENT; + } + /* open must fail on symlink - reset rc */ rc = 0; symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; sub_len = le16_to_cpu(symlink->SubstituteNameLength); sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); + print_len = le16_to_cpu(symlink->PrintNameLength); + print_offset = le16_to_cpu(symlink->PrintNameOffset); + + if (get_rfc1002_length(err_buf) + 4 < + SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { + kfree(utf16_path); + return -ENOENT; + } + + if (get_rfc1002_length(err_buf) + 4 < + SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { + kfree(utf16_path); + return -ENOENT; + } + *target_path = cifs_strndup_from_utf16( (char *)symlink->PathBuffer + sub_offset, sub_len, true, cifs_sb->local_nls); @@ -1483,6 +1511,7 @@ struct smb_version_operations smb20_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1561,6 +1590,7 @@ struct smb_version_operations smb21_operations = { .clear_stats = smb2_clear_stats, .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1642,6 +1672,7 @@ struct smb_version_operations smb30_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, @@ -1729,6 +1760,7 @@ struct smb_version_operations smb311_operations = { .print_stats = smb2_print_stats, .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, + .handle_cancelled_mid = smb2_handle_cancelled_mid, .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 373b5cd1c913..6c484ddf26a9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -103,7 +103,21 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->ProtocolId[3] = 'B'; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; - hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ + if (tcon && tcon->ses && tcon->ses->server) { + struct TCP_Server_Info *server = tcon->ses->server; + + spin_lock(&server->req_lock); + /* Request up to 2 credits but don't go over the limit. */ + if (server->credits >= SMB2_MAX_CREDITS_AVAILABLE) + hdr->CreditRequest = cpu_to_le16(0); + else + hdr->CreditRequest = cpu_to_le16( + min_t(int, SMB2_MAX_CREDITS_AVAILABLE - + server->credits, 2)); + spin_unlock(&server->req_lock); + } else { + hdr->CreditRequest = cpu_to_le16(2); + } hdr->ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) @@ -264,7 +278,7 @@ out: case SMB2_CHANGE_NOTIFY: case SMB2_QUERY_INFO: case SMB2_SET_INFO: - return -EAGAIN; + rc = -EAGAIN; } unload_nls(nls_codepage); return rc; @@ -550,8 +564,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { - cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); - return -EIO; + cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", + rsplen); + + /* relax check since Mac returns max bufsize allowed on ioctl */ + if (rsplen > CIFSMaxBufSize) + return -EIO; } /* check validate negotiate info response matches what we got earlier */ @@ -591,8 +609,9 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, u16 blob_length = 0; struct key *spnego_key = NULL; char *security_blob = NULL; - char *ntlmssp_blob = NULL; + unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ + u64 previous_session = ses->Suid; cifs_dbg(FYI, "Session Setup\n"); @@ -630,6 +649,10 @@ ssetup_ntlmssp_authenticate: return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + + /* if reconnect, we need to send previous sess id, otherwise it is 0 */ + req->PreviousSessionId = previous_session; + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); @@ -716,13 +739,7 @@ ssetup_ntlmssp_authenticate: iov[1].iov_len = blob_length; } else if (phase == NtLmAuthenticate) { req->hdr.SessionId = ses->Suid; - ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses, + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, nls_cp); if (rc) { cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", @@ -919,9 +936,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, else return -EIO; - if (tcon && tcon->bad_network_name) - return -ENOENT; - if ((tcon && tcon->seal) && ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) { cifs_dbg(VFS, "encryption requested but no server support"); @@ -939,6 +953,10 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, return -EINVAL; } + /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ + if (tcon) + tcon->tid = 0; + rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req); if (rc) { kfree(unc_path); @@ -1019,8 +1037,6 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - if (tcon) - tcon->bad_network_name = true; } goto tcon_exit; } @@ -1173,7 +1189,7 @@ create_durable_v2_buf(struct cifs_fid *pfid) buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - get_random_bytes(buf->dcontext.CreateGuid, 16); + generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ @@ -1506,8 +1522,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * than one credit. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit + * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE + * (by default, note that it can be overridden to make max larger) + * in responses (except for read responses which can be bigger. + * We may want to bump this limit up */ - req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); @@ -1809,6 +1829,54 @@ smb2_echo_callback(struct mid_q_entry *mid) add_credits(server, credits_received, CIFS_ECHO_OP); } +void smb2_reconnect_server(struct work_struct *work) +{ + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, reconnect.work); + struct cifs_ses *ses; + struct cifs_tcon *tcon, *tcon2; + struct list_head tmp_list; + int tcon_exist = false; + + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ + mutex_lock(&server->reconnect_mutex); + + INIT_LIST_HEAD(&tmp_list); + cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->need_reconnect) { + tcon->tc_count++; + list_add_tail(&tcon->rlist, &tmp_list); + tcon_exist = true; + } + } + } + /* + * Get the reference to server struct to be sure that the last call of + * cifs_put_tcon() in the loop below won't release the server pointer. + */ + if (tcon_exist) + server->srv_count++; + + spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { + smb2_reconnect(SMB2_ECHO, tcon); + list_del_init(&tcon->rlist); + cifs_put_tcon(tcon); + } + + cifs_dbg(FYI, "Reconnecting tcons finished\n"); + mutex_unlock(&server->reconnect_mutex); + + /* now we can safely release srv struct */ + if (tcon_exist) + cifs_put_tcp_session(server, 1); +} + int SMB2_echo(struct TCP_Server_Info *server) { @@ -1820,6 +1888,12 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request\n"); + if (server->tcpStatus == CifsNeedNegotiate) { + /* No need to send echo on newly established connections */ + queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + return rc; + } + rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) return rc; @@ -2038,6 +2112,7 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); + buf->CreditRequest = buf->CreditCharge; spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(buf->CreditCharge); @@ -2224,6 +2299,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); + req->hdr.CreditRequest = req->hdr.CreditCharge; spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(req->hdr.CreditCharge); @@ -2577,6 +2653,22 @@ SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, } int +SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + __u8 delete_pending = 1; + void *data; + unsigned int size; + + data = &delete_pending; + size = 1; /* sizeof __u8 */ + + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_DISPOSITION_INFORMATION, 1, &data, + &size); +} + +int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le16 *target_file) { @@ -2676,8 +2768,8 @@ copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); - kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); - kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + kst->f_bfree = kst->f_bavail = + le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); return; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4af52780ec35..aacb15bd56fe 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -82,8 +82,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* BB FIXME - analyze following length BB */ -#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ +/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ +#define MAX_SMB2_HDR_SIZE 0x00b0 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) @@ -276,7 +276,7 @@ struct smb2_sess_setup_req { __le32 Channel; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le64 PreviousSessionId; + __u64 PreviousSessionId; __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 79dc650c18b2..adc5234486c3 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst); extern struct mid_q_entry *smb2_setup_async_request( struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, + __u64 ses_id); +extern struct cifs_tcon *smb2_find_smb_tcon(struct TCP_Server_Info *server, + __u64 ses_id, __u32 tid); extern int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server); extern int smb3_calc_signature(struct smb_rqst *rqst, @@ -95,6 +99,7 @@ extern int smb2_open_file(const unsigned int xid, extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); +extern void smb2_reconnect_server(struct work_struct *work); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -140,6 +145,8 @@ extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le16 *target_file); +extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid); extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, __le16 *target_file); @@ -154,6 +161,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, const __u8 oplock_level); +extern int smb2_handle_cancelled_mid(char *buffer, + struct TCP_Server_Info *server); +void smb2_cancelled_close_fid(struct work_struct *work); extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4c5b6f109a7..69e3b322bbfe 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -115,22 +115,68 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) } static struct cifs_ses * -smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) +smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) { struct cifs_ses *ses; - spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->Suid != smb2hdr->SessionId) + if (ses->Suid != ses_id) continue; - spin_unlock(&cifs_tcp_ses_lock); return ses; } + + return NULL; +} + +struct cifs_ses * +smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + ses = smb2_find_smb_ses_unlocked(server, ses_id); spin_unlock(&cifs_tcp_ses_lock); + return ses; +} + +static struct cifs_tcon * +smb2_find_smb_sess_tcon_unlocked(struct cifs_ses *ses, __u32 tid) +{ + struct cifs_tcon *tcon; + + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->tid != tid) + continue; + ++tcon->tc_count; + return tcon; + } + return NULL; } +/* + * Obtain tcon corresponding to the tid in the given + * cifs_ses + */ + +struct cifs_tcon * +smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) +{ + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + ses = smb2_find_smb_ses_unlocked(server, ses_id); + if (!ses) { + spin_unlock(&cifs_tcp_ses_lock); + return NULL; + } + tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); + spin_unlock(&cifs_tcp_ses_lock); + + return tcon; +} int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) @@ -143,7 +189,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - ses = smb2_find_smb_ses(smb2_pdu, server); + ses = smb2_find_smb_ses(server, smb2_pdu->SessionId); if (!ses) { cifs_dbg(VFS, "%s: Could not find session\n", __func__); return 0; @@ -314,7 +360,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; - ses = smb2_find_smb_ses(smb2_pdu, server); + ses = smb2_find_smb_ses(server, smb2_pdu->SessionId); if (!ses) { cifs_dbg(VFS, "%s: Could not find session\n", __func__); return 0; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 87abe8ed074c..54af10204e83 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -786,9 +786,11 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rc = wait_for_response(ses->server, midQ); if (rc != 0) { + cifs_dbg(FYI, "Cancelling wait for mid %llu\n", midQ->mid); send_cancel(ses->server, buf, midQ); spin_lock(&GlobalMid_Lock); if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + midQ->mid_flags |= MID_WAIT_CANCELLED; midQ->callback = DeleteMidQEntry; spin_unlock(&GlobalMid_Lock); cifs_small_buf_release(buf); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index ec5c8325b503..0525ebc3aea2 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item, ret = -ENOMEM; sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { - sl->sl_target = config_item_get(item); spin_lock(&configfs_dirent_lock); if (target_sd->s_type & CONFIGFS_USET_DROPPING) { spin_unlock(&configfs_dirent_lock); - config_item_put(item); kfree(sl); return -ENOENT; } + sl->sl_target = config_item_get(item); list_add(&sl->sl_list, &target_sd->s_links); spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, diff --git a/fs/coredump.c b/fs/coredump.c index 1777331eee76..a8852293038a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1,6 +1,7 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/freezer.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> @@ -32,6 +33,9 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/path.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -396,7 +400,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) if (core_waiters > 0) { struct core_thread *ptr; + freezer_do_not_count(); wait_for_completion(&core_state->startup); + freezer_count(); /* * Wait for all the threads to become inactive, so that * all the thread context (extended register state, like @@ -627,6 +633,8 @@ void do_coredump(const siginfo_t *siginfo) } } else { struct inode *inode; + int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) goto fail_unlock; @@ -665,10 +673,27 @@ void do_coredump(const siginfo_t *siginfo) * what matters is that at least one of the two processes * writes its coredump successfully, not which one. */ - cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | - O_LARGEFILE | O_EXCL, - 0600); + if (need_suid_safe) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. + */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + cprm.file = file_open_root(root.dentry, root.mnt, + cn.corename, open_flags, 0600); + path_put(&root); + } else { + cprm.file = filp_open(cn.corename, open_flags, 0600); + } if (IS_ERR(cprm.file)) goto fail_unlock; @@ -785,3 +810,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * postion. This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); diff --git a/fs/dcache.c b/fs/dcache.c index 877bcbbd03ff..3ed642e0a0c2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -269,6 +269,33 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } +void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + atomic_inc(&p->u.count); + spin_unlock(&dentry->d_lock); + name->name = p->name; + } else { + memcpy(name->inline_name, dentry->d_iname, DNAME_INLINE_LEN); + spin_unlock(&dentry->d_lock); + name->name = name->inline_name; + } +} +EXPORT_SYMBOL(take_dentry_name_snapshot); + +void release_dentry_name_snapshot(struct name_snapshot *name) +{ + if (unlikely(name->name != name->inline_name)) { + struct external_name *p; + p = container_of(name->name, struct external_name, name[0]); + if (unlikely(atomic_dec_and_test(&p->u.count))) + kfree_rcu(p, u.head); + } +} +EXPORT_SYMBOL(release_dentry_name_snapshot); + static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -578,7 +605,6 @@ static struct dentry *dentry_kill(struct dentry *dentry) failed: spin_unlock(&dentry->d_lock); - cpu_relax(); return dentry; /* try again with same dentry */ } @@ -752,6 +778,8 @@ void dput(struct dentry *dentry) return; repeat: + might_sleep(); + rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); @@ -783,8 +811,10 @@ repeat: kill_it: dentry = dentry_kill(dentry); - if (dentry) + if (dentry) { + cond_resched(); goto repeat; + } } EXPORT_SYMBOL(dput); @@ -1125,11 +1155,12 @@ void shrink_dcache_sb(struct super_block *sb) LIST_HEAD(dispose); freed = list_lru_walk(&sb->s_dentry_lru, - dentry_lru_isolate_shrink, &dispose, UINT_MAX); + dentry_lru_isolate_shrink, &dispose, 1024); this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); - } while (freed > 0); + cond_resched(); + } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1319,8 +1350,11 @@ int d_set_mounted(struct dentry *dentry) } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { - dentry->d_flags |= DCACHE_MOUNTED; - ret = 0; + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } } spin_unlock(&dentry->d_lock); out: @@ -1618,7 +1652,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - + dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1666,7 +1700,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | - DCACHE_OP_SELECT_INODE)); + DCACHE_OP_SELECT_INODE | + DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; @@ -1684,6 +1719,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_select_inode) dentry->d_flags |= DCACHE_OP_SELECT_INODE; + if (op->d_real) + dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); @@ -2410,7 +2447,6 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) { BUG_ON(!d_unhashed(entry)); hlist_bl_lock(b); - entry->d_flags |= DCACHE_RCUACCESS; hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } @@ -2629,6 +2665,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { /* splicing a tree */ + dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_parent = target->d_parent; target->d_parent = target; list_del_init(&target->d_child); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0b2f..e49ba072bd64 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -457,7 +457,7 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; dentry->d_fsdata = (void *)f; @@ -669,7 +669,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, { int error; struct dentry *dentry = NULL, *trap; - const char *old_name; + struct name_snapshot old_name; trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ @@ -684,19 +684,19 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry); if (error) { - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name.name, d_is_dir(old_dentry), NULL, old_dentry); - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); unlock_rename(new_dir, old_dir); dput(dentry); return old_dentry; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 706de324f2a6..c82edb049117 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -128,6 +128,7 @@ static const match_table_t tokens = { struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; + struct super_block *sb; struct dentry *ptmx_dentry; }; @@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; -static void *new_pts_fs_info(void) +static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; @@ -369,6 +370,7 @@ static void *new_pts_fs_info(void) ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->sb = sb; return fsi; } @@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; - s->s_fs_info = new_pts_fs_info(); + s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; @@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int devpts_new_index(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi; int index; int ida_ret; - if (!sb) + if (!fsi) return -ENODEV; - fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -564,11 +563,8 @@ retry: return index; } -void devpts_kill_index(struct inode *ptmx_inode, int idx) +void devpts_kill_index(struct pts_fs_info *fsi, int idx) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); - mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); pty_count--; @@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) /* * pty code needs to hold extra references in case of last /dev/tty close */ - -void devpts_add_ref(struct inode *ptmx_inode) +struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; + struct pts_fs_info *fsi; + + sb = pts_sb_from_inode(ptmx_inode); + if (!sb) + return NULL; + fsi = DEVPTS_SB(sb); + if (!fsi) + return NULL; atomic_inc(&sb->s_active); - ihold(ptmx_inode); + return fsi; } -void devpts_del_ref(struct inode *ptmx_inode) +void devpts_put_ref(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - - iput(ptmx_inode); - deactivate_super(sb); + deactivate_super(fsi->sb); } /** @@ -604,22 +604,21 @@ void devpts_del_ref(struct inode *ptmx_inode) * * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, +struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, void *priv) { struct dentry *dentry; - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; struct inode *inode; struct dentry *root; - struct pts_fs_info *fsi; struct pts_mount_opts *opts; char s[12]; - if (!sb) + if (!fsi) return ERR_PTR(-ENODEV); + sb = fsi->sb; root = sb->s_root; - fsi = DEVPTS_SB(sb); opts = &fsi->mount_opts; inode = new_inode(sb); diff --git a/fs/direct-io.c b/fs/direct-io.c index 01171d8a6ee9..c772fdf36cd9 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -575,7 +575,7 @@ static int dio_set_defer_completion(struct dio *dio) /* * Call into the fs to map some more disk blocks. We record the current number * of available blocks at sdio->blocks_available. These are in units of the - * fs blocksize, (1 << inode->i_blkbits). + * fs blocksize, i_blocksize(inode). * * The fs is allowed to map lots of blocks at once. If it wants to do that, * it uses the passed inode-relative block number as the file offset, as usual. diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3a37bd3f9637..9d7a4a714907 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1607,16 +1607,12 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; foreach_conn(stop_conn); + clean_writequeues(); + foreach_conn(free_conn); mutex_unlock(&connections_lock); work_stop(); - mutex_lock(&connections_lock); - clean_writequeues(); - - foreach_conn(free_conn); - - mutex_unlock(&connections_lock); kmem_cache_destroy(con_cache); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 173b3873a4f4..e40c440a4555 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -355,6 +355,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; } fail: return error; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..27794b137b24 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -170,6 +169,19 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open * @inode: inode speciying file to open @@ -223,14 +235,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -247,6 +251,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -267,6 +310,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -346,25 +402,21 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009cad8d5c..1b08556776ce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -518,8 +518,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -1003,17 +1008,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->task_list); - } - spin_lock_irqsave(&ep->lock, flags); /* @@ -1078,6 +1072,23 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); + + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->task_list); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } + return 1; } diff --git a/fs/exec.c b/fs/exec.c index b06623a9347f..9c5ee2a880aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include <linux/slab.h> @@ -56,6 +56,7 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -205,7 +206,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; - struct rlimit *rlim; + unsigned long ptr_size, limit; + + /* + * Since the stack will hold pointers to the strings, we + * must account for them as well. + * + * The size calculation is the entire vma while each arg page is + * built, so each time we get here it's calculating how far it + * is currently (rather than each call being just the newly + * added size from the arg page). As a result, we need to + * always add the entire size of the pointers, so that on the + * last call to get_arg_page() we'll actually have the entire + * correct size. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (ptr_size > ULONG_MAX - size) + goto fail; + size += ptr_size; acct_arg_size(bprm, size / PAGE_SIZE); @@ -217,20 +235,24 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return page; /* - * Limit to 1/4-th the stack size for the argv+env strings. + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ - rlim = current->signal->rlim; - if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { - put_page(page); - return NULL; - } + limit = _STK_LIM / 4 * 3; + limit = min(limit, rlimit(RLIMIT_STACK) / 4); + if (size > limit) + goto fail; } return page; + +fail: + put_page(page); + return NULL; } static void put_arg_page(struct page *page) @@ -1114,6 +1136,13 @@ int flush_old_exec(struct linux_binprm * bprm) flush_thread(); current->personality &= ~bprm->per_clear; + /* + * We have to apply CLOEXEC before we change whether the process is + * dumpable (in setup_new_exec) to avoid a race with a process in userspace + * trying to access the should-be-closed file descriptors of a process + * undergoing exec(2). + */ + do_close_on_exec(current->files); return 0; out: @@ -1123,8 +1152,22 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } } EXPORT_SYMBOL(would_dump); @@ -1154,7 +1197,6 @@ void setup_new_exec(struct linux_binprm * bprm) !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { - would_dump(bprm, bprm->file); if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } @@ -1163,7 +1205,6 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; flush_signal_handlers(current, 0); - do_close_on_exec(current->files); } EXPORT_SYMBOL(setup_new_exec); @@ -1254,7 +1295,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; if (p->ptrace) { - if (p->ptrace & PT_PTRACE_CAP) + if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else bprm->unsafe |= LSM_UNSAFE_PTRACE; @@ -1587,6 +1628,8 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out; + would_dump(bprm, bprm->file); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 27695e6f4e46..d6aeb84e90b6 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -193,15 +193,11 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); } break; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 69b1e73026a5..c3fe1e323951 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,15 +196,11 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fe1f50fe764f..f97110461c19 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -208,6 +208,9 @@ static int ext4_init_block_bitmap(struct super_block *sb, memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073663..f240cef8b326 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -34,6 +34,7 @@ #include <linux/random.h> #include <linux/scatterlist.h> #include <linux/spinlock_types.h> +#include <linux/namei.h> #include "ext4_extents.h" #include "xattr.h" @@ -93,7 +94,8 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, + gfp_t gfp_flags) { struct ext4_crypto_ctx *ctx = NULL; int res = 0; @@ -120,7 +122,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); if (!ctx) { - ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, gfp_flags); if (!ctx) { res = -ENOMEM; goto out; @@ -257,7 +259,8 @@ static int ext4_page_crypto(struct inode *inode, ext4_direction_t rw, pgoff_t index, struct page *src_page, - struct page *dest_page) + struct page *dest_page, + gfp_t gfp_flags) { u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; @@ -268,7 +271,7 @@ static int ext4_page_crypto(struct inode *inode, struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = ablkcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -309,9 +312,10 @@ static int ext4_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx, + gfp_t gfp_flags) { - ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); ctx->flags |= EXT4_WRITE_PATH_FL; @@ -334,7 +338,8 @@ static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) * error value or NULL. */ struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page) + struct page *plaintext_page, + gfp_t gfp_flags) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; @@ -342,17 +347,17 @@ struct page *ext4_encrypt(struct inode *inode, BUG_ON(!PageLocked(plaintext_page)); - ctx = ext4_get_crypto_ctx(inode); + ctx = ext4_get_crypto_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *) ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); + plaintext_page, ciphertext_page, gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); errout: @@ -380,8 +385,8 @@ int ext4_decrypt(struct page *page) { BUG_ON(!PageLocked(page)); - return ext4_page_crypto(page->mapping->host, - EXT4_DECRYPT, page->index, page, page); + return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, + page->index, page, page, GFP_NOFS); } int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) @@ -402,11 +407,11 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); - ctx = ext4_get_crypto_ctx(inode); + ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); if (IS_ERR(ciphertext_page)) { err = PTR_ERR(ciphertext_page); goto errout; @@ -414,11 +419,12 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) while (len--) { err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page); + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); if (err) goto errout; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_NOWAIT, 1); if (!bio) { err = -ENOMEM; goto errout; @@ -469,3 +475,61 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) return size; return 0; } + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + struct ext4_crypt_info *ci; + int dir_has_key, cached_with_key; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!ext4_encrypted_inode(d_inode(dir))) { + dput(dir); + return 0; + } + ci = EXT4_I(d_inode(dir))->i_crypt_info; + + /* this should eventually be an flag in d_flags */ + cached_with_key = dentry->d_fsdata != NULL; + dir_has_key = (ci != NULL); + dput(dir); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) { +#if 0 /* Revalidation debug */ + char buf[80]; + char *cp = simple_dname(dentry, buf, sizeof(buf)); + + if (IS_ERR(cp)) + cp = (char *) "???"; + pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, + cached_with_key, d_is_negative(dentry), + dir_has_key); +#endif + return 0; + } + return 1; +} + +const struct dentry_operations ext4_encrypted_d_ops = { + .d_revalidate = ext4_d_revalidate, +}; diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 2fbef8a14760..2cfe3ffc276f 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -343,7 +343,7 @@ int _ext4_fname_disk_to_usr(struct inode *inode, memcpy(buf+4, &hinfo->minor_hash, 4); } else memset(buf, 0, 8); - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name+1); oname->len = ret + 1; diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 9a16d1e75a49..505f8afde57c 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -88,8 +88,6 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci) if (!ci) return; - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(ext4_crypt_info_cachep, ci); } @@ -111,7 +109,7 @@ void ext4_free_encryption_info(struct inode *inode, ext4_free_crypt_info(ci); } -int _ext4_get_encryption_info(struct inode *inode) +int ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_crypt_info *crypt_info; @@ -128,22 +126,15 @@ int _ext4_get_encryption_info(struct inode *inode) char mode; int res; + if (ei->i_crypt_info) + return 0; + if (!ext4_read_workqueue) { res = ext4_init_crypto(); if (res) return res; } -retry: - crypt_info = ACCESS_ONCE(ei->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - ext4_free_encryption_info(inode, crypt_info); - goto retry; - } - res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx)); @@ -166,7 +157,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) @@ -206,7 +196,6 @@ retry: keyring_key = NULL; goto out; } - crypt_info->ci_keyring_key = keyring_key; if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING "ext4: key type must be logon\n"); @@ -253,16 +242,13 @@ got_key: ext4_encryption_key_size(mode)); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { - ext4_free_crypt_info(crypt_info); - goto retry; - } - return 0; + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; + key_put(keyring_key); ext4_free_crypt_info(crypt_info); memzero_explicit(raw_key, sizeof(raw_key)); return res; diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index ad050698143f..e4f4fc4e56ab 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -102,6 +102,9 @@ static int ext4_create_encryption_context_from_policy( int ext4_process_policy(const struct ext4_encryption_policy *policy, struct inode *inode) { + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; @@ -145,20 +148,38 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) int ext4_is_child_context_consistent_with_parent(struct inode *parent, struct inode *child) { - struct ext4_crypt_info *parent_ci, *child_ci; + const struct ext4_crypt_info *parent_ci, *child_ci; + struct ext4_encryption_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - pr_err("parent %p child %p\n", parent, child); - WARN_ON(1); /* Should never happen */ - return 0; - } - /* no restrictions if the parent directory is not encrypted */ + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + + /* No restrictions if the parent directory is unencrypted */ if (!ext4_encrypted_inode(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ + + /* Encrypted directories must not contain unencrypted files */ if (!ext4_encrypted_inode(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = ext4_get_encryption_info(parent); if (res) return 0; @@ -167,17 +188,35 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, return 0; parent_ci = EXT4_I(parent)->i_crypt_info; child_ci = EXT4_I(child)->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) + return 0; + + res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } /** diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d1bca74f844..6d17f31a31d7 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + if (ext4_encrypted_inode(inode)) { + err = ext4_get_encryption_info(inode); + if (err && err != -ENOKEY) + return err; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87144..c8ad14c697c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -221,6 +221,7 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 +#define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else @@ -850,6 +851,29 @@ do { \ #include "extents_status.h" /* + * Lock subclasses for i_data_sem in the ext4_inode_info structure. + * + * These are needed to avoid lockdep false positives when we need to + * allocate blocks to the quota inode during ext4_map_blocks(), while + * holding i_data_sem for a normal (non-quota) inode. Since we don't + * do quota tracking for the quota inode, this avoids deadlock (as + * well as infinite recursion, since it isn't turtles all the way + * down...) + * + * I_DATA_SEM_NORMAL - Used for most inodes + * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode + * where the second inode has larger inode number + * than the first + * I_DATA_SEM_QUOTA - Used for quota inodes only + */ +enum { + I_DATA_SEM_NORMAL = 0, + I_DATA_SEM_OTHER, + I_DATA_SEM_QUOTA, +}; + + +/* * fourth extended file system inode data in memory */ struct ext4_inode_info { @@ -910,6 +934,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -2228,13 +2261,16 @@ extern struct kmem_cache *ext4_crypt_info_cachep; bool ext4_valid_contents_enc_mode(uint32_t mode); uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); extern struct workqueue_struct *ext4_read_workqueue; -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode); +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, + gfp_t gfp_flags); void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page); + struct page *plaintext_page, + gfp_t gfp_flags); int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +extern const struct dentry_operations ext4_encrypted_d_ops; #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2297,23 +2333,11 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } /* crypto_key.c */ void ext4_free_crypt_info(struct ext4_crypt_info *ci); void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); -int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); -static inline int ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _ext4_get_encryption_info(inode); - return 0; -} +int ext4_get_encryption_info(struct inode *inode); static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) { @@ -2484,6 +2508,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); @@ -2848,6 +2873,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index ac7d4e813796..1b17b05b9f4d 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -78,7 +78,6 @@ struct ext4_crypt_info { char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5f5846211095..f817ed58f5ad 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -395,17 +395,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); + BUG(); } static inline int ext4_should_journal_data(struct inode *inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b17a..61d5bfc7318c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -376,9 +376,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); - ext4_lblk_t last = lblock + len - 1; - if (len == 0 || lblock > last) + /* + * We allow neither: + * - zero length + * - overflow/wrap-around + */ + if (lblock + len <= lblock) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -469,6 +473,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + if (unlikely(depth > 32)) { + error_msg = "too large eh_depth"; + goto corrupted; + } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { @@ -4685,10 +4693,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - /* * credits to insert 1 extent into extent tree */ @@ -4752,8 +4756,6 @@ retry: goto retry; } - ext4_inode_resume_unlocked_dio(inode); - return ret > 0 ? ret2 : ret; } @@ -4770,7 +4772,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4787,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. This is not fallocate, we neet to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4839,6 +4829,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4841,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4856,16 +4850,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4901,6 +4902,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); @@ -4998,8 +5001,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5356,7 +5364,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5365,15 +5374,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); + stop = le32_to_cpu(extent->ee_block); /* * In case of left shift, Don't start shifting extents until we make * sure the hole is big enough to accommodate the shift. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, 0); + path = ext4_find_extent(inode, start - 1, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5405,9 +5414,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, else iterator = &stop; - /* Its safe to start updating extents */ - while (start < stop) { - path = ext4_find_extent(inode, *iterator, &path, 0); + /* + * Its safe to start updating extents. Start and stop are unsigned, so + * in case of right shift if extent with 0 block is reached, iterator + * becomes NULL to indicate the end of the loop. + */ + while (iterator && start <= stop) { + path = ext4_find_extent(inode, *iterator, &path, + EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5434,8 +5448,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); - *iterator = le32_to_cpu(extent->ee_block) > 0 ? - le32_to_cpu(extent->ee_block) - 1 : 0; + if (le32_to_cpu(extent->ee_block) > 0) + *iterator = le32_to_cpu(extent->ee_block) - 1; + else + /* Beginning is reached, end of the loop */ + iterator = NULL; /* Update path extent in case we need to stop */ while (le32_to_cpu(extent->ee_block) < start) extent++; @@ -5494,21 +5511,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5524,17 +5527,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5570,10 +5599,12 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); @@ -5627,21 +5658,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5660,17 +5677,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ @@ -5718,6 +5750,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + } else { + ext4_ext_drop_refs(path); + kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, @@ -5738,10 +5773,13 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7ba98..45ef9975caec 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; handle_t *handle = NULL; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; @@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; @@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + int err; + struct inode *inode = file_inode(vma->vm_file); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + err = __dax_mkwrite(vma, vmf, ext4_get_block_dax, + ext4_end_io_unwritten); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(inode->i_sb); + + return err; +} + +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * handler we check for races agaist truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired. + */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + int ret = VM_FAULT_NOPAGE; + loff_t size; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -415,47 +463,27 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, (pgoff_t)num); - if (nr_pages == 0) { - if (whence == SEEK_DATA) - break; - - BUG_ON(whence != SEEK_HOLE); - /* - * If this is the first time to go into the loop and - * offset is not beyond the end offset, it will be a - * hole at this offset - */ - if (lastoff == startoff || lastoff < endoff) - found = 1; - break; - } - - /* - * If this is the first time to go into the loop and - * offset is smaller than the first page offset, it will be a - * hole at this offset. - */ - if (lastoff == startoff && whence == SEEK_HOLE && - lastoff < page_offset(pvec.pages[0])) { - found = 1; + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; struct buffer_head *bh, *head; /* - * If the current offset is not beyond the end of given - * range, it will be a hole. + * If current offset is smaller than the page offset, + * there is a hole at this offset. */ - if (lastoff < endoff && whence == SEEK_HOLE && - page->index > end) { + if (whence == SEEK_HOLE && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { found = 1; *offset = lastoff; goto out; } + if (page->index > end) + goto out; + lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -472,6 +500,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = page_offset(page); bh = head = page_buffers(page); do { + if (lastoff + bh->b_size <= startoff) + goto next; if (buffer_uptodate(bh) || buffer_unwritten(bh)) { if (whence == SEEK_DATA) @@ -486,6 +516,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); goto out; } +next: lastoff += bh->b_size; bh = bh->b_this_page; } while (bh != head); @@ -495,20 +526,18 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); } - /* - * The no. of pages is less than our desired, that would be a - * hole in there. - */ - if (nr_pages < num && whence == SEEK_HOLE) { - found = 1; - *offset = lastoff; + /* The no. of pages is less than our desired, we are done. */ + if (nr_pages < num) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + if (whence == SEEK_HOLE && lastoff < endoff) { + found = 1; + *offset = lastoff; + } out: pagevec_release(&pvec); return found; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 53f2b98a69f3..5388207d2832 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1143,25 +1143,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; - struct buffer_head *bitmap_bh; + struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; - long err = -EIO; + int err = -EFSCORRUPTED; - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); - err = -EFSCORRUPTED; - goto error; - } + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - ext4_warning(sb, "inode bitmap error %ld for orphan %lu", - ino, err); - goto error; + ext4_error(sb, "inode bitmap error %ld for orphan %lu", + ino, PTR_ERR(bitmap_bh)); + return (struct inode *) bitmap_bh; } /* Having the inode bit set should be a 100% indicator that this @@ -1172,15 +1167,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) goto bad_orphan; inode = ext4_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(sb, "couldn't read orphan inode %lu (err %d)", + ino, err); + return inode; + } /* - * If the orphans has i_nlinks > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. + * If the orphans has i_nlinks > 0 then it should be able to + * be truncated, otherwise it won't be removed from the orphan + * list during processing and an infinite loop will result. + * Similarly, it must not be a bad inode. */ - if (inode->i_nlink && !ext4_can_truncate(inode)) + if ((inode->i_nlink && !ext4_can_truncate(inode)) || + is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) @@ -1188,29 +1189,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) brelse(bitmap_bh); return inode; -iget_failed: - err = PTR_ERR(inode); - inode = NULL; bad_orphan: - ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_WARNING "inode=%p\n", inode); + ext4_error(sb, "bad orphan inode %lu", ino); + if (bitmap_bh) + printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { - printk(KERN_WARNING "is_bad_inode(inode)=%d\n", + printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_WARNING "max_ino=%lu\n", max_ino); - printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); + printk(KERN_ERR "max_ino=%lu\n", max_ino); + printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); -error: return ERR_PTR(err); } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..dad8e7bdf0a6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -336,8 +336,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); @@ -931,8 +933,15 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page) { int i_size_changed = 0; + int ret; - copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + ret = ext4_write_inline_data_end(inode, pos, len, copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ret; + } + copied = ret; /* * No need to use i_size_read() here, the i_size @@ -1149,10 +1158,9 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, set_buffer_uptodate(dir_block); err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) - goto out; + return err; set_buffer_verified(dir_block); -out: - return err; + return ext4_mark_inode_dirty(handle, inode); } static int ext4_convert_inline_data_nolock(handle_t *handle, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 06bda0361e7c..1796d1bd9a1d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -51,25 +51,30 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u16 csum_lo; - __u16 csum_hi = 0; __u32 csum; + __u16 dummy_csum = 0; + int offset = offsetof(struct ext4_inode, i_checksum_lo); + unsigned int csum_size = sizeof(dummy_csum); - csum_lo = le16_to_cpu(raw->i_checksum_lo); - raw->i_checksum_lo = 0; - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = le16_to_cpu(raw->i_checksum_hi); - raw->i_checksum_hi = 0; - } - - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, - EXT4_INODE_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_GOOD_OLD_INODE_SIZE - offset); - raw->i_checksum_lo = cpu_to_le16(csum_lo); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = cpu_to_le16(csum_hi); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + offset = offsetof(struct ext4_inode, i_checksum_hi); + csum = ext4_chksum(sbi, csum, (__u8 *)raw + + EXT4_GOOD_OLD_INODE_SIZE, + offset - EXT4_GOOD_OLD_INODE_SIZE); + if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum_size); + offset += csum_size; + } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); + } return csum; } @@ -205,9 +210,9 @@ void ext4_evict_inode(struct inode *inode) * Note that directories do not have this problem because they * don't use page cache. */ - if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT4_JOURNAL_INO) { + if (inode->i_ino != EXT4_JOURNAL_INO && + ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -1159,8 +1164,11 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto errout; + } copied = ret; } else copied = block_write_end(file, mapping, pos, @@ -1214,7 +1222,9 @@ errout: * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_handle_dirty_metadata() instead. */ -static void zero_new_buffers(struct page *page, unsigned from, unsigned to) +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct page *page, + unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; @@ -1231,7 +1241,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to) size = min(to, block_end) - start; zero_user(page, start, size); - set_buffer_uptodate(bh); + write_end_fn(handle, bh); } clear_buffer_new(bh); } @@ -1260,18 +1270,25 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else { - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - zero_new_buffers(page, from+copied, to); + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) { + unlock_page(page); + put_page(page); + goto errout; } - + copied = ret; + } else if (unlikely(copied < len) && !PageUptodate(page)) { + copied = 0; + ext4_journalled_zero_new_buffers(handle, page, from, to); + } else { + if (unlikely(copied < len)) + ext4_journalled_zero_new_buffers(handle, page, + from + copied, to); ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); + from + copied, &partial, + write_end_fn); if (!partial) SetPageUptodate(page); } @@ -1297,6 +1314,7 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -2026,7 +2044,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, { struct inode *inode = mpd->inode; int err; - ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; do { @@ -2589,13 +2607,36 @@ retry: done = true; } } - ext4_journal_stop(handle); + /* + * Caution: If the handle is synchronous, + * ext4_journal_stop() can wait for transaction commit + * to finish which may depend on writeback of pages to + * complete or on page lock to be released. In that + * case, we have to wait until after after we have + * submitted all the IO, released page locks we hold, + * and dropped io_end reference (for extent conversion + * to be able to complete) before stopping the handle. + */ + if (!ext4_handle_valid(handle) || handle->h_sync == 0) { + ext4_journal_stop(handle); + handle = NULL; + } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); - /* Drop our io_end reference we got from init */ - ext4_put_io_end(mpd.io_submit.io_end); + /* + * Drop our io_end reference we got from init. We have + * to be careful and use deferred io_end finishing if + * we are still holding the transaction as we can + * release the last reference to io_end which may end + * up doing unwritten extent conversion. + */ + if (handle) { + ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_journal_stop(handle); + } else + ext4_put_io_end(mpd.io_submit.io_end); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -3531,6 +3572,10 @@ static int ext4_block_truncate_page(handle_t *handle, unsigned blocksize; struct inode *inode = mapping->host; + /* If we are processing an encrypted inode during orphan list handling */ + if (ext4_encrypted_inode(inode) && !ext4_has_encryption_key(inode)) + return 0; + blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); @@ -3587,7 +3632,36 @@ int ext4_can_truncate(struct inode *inode) } /* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* + * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode @@ -3616,7 +3690,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Write out all dirty pages to avoid race conditions * Then release them. */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) @@ -3651,17 +3725,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3708,16 +3791,14 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: mutex_unlock(&inode->i_mutex); @@ -4112,6 +4193,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; + loff_t size; int block; uid_t i_uid; gid_t i_gid; @@ -4203,6 +4285,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -4486,14 +4573,14 @@ static int ext4_do_update_inode(handle_t *handle, * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if (!ei->i_dtime) { + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); @@ -4851,6 +4938,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. @@ -4858,6 +4946,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) truncate_pagecache(inode, inode->i_size); if (shrink) ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } if (!rc) { @@ -5075,8 +5164,9 @@ static int ext4_expand_extra_isize(struct inode *inode, /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize, 0, + new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } @@ -5109,6 +5199,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { @@ -5125,8 +5217,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - ext4_set_inode_state(inode, - EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { ext4_warning(inode->i_sb, @@ -5139,9 +5229,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) } } } - if (!err) - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - return err; + return ext4_mark_iloc_dirty(handle, inode, &iloc); } /* @@ -5306,6 +5394,13 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_convert_inline_data(inode); + if (ret) + goto out_ret; + /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5375,6 +5470,19 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40e5e..bcd7c4788903 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -622,6 +622,9 @@ resizefs_out: struct ext4_encryption_policy policy; int err = 0; + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + if (copy_from_user(&policy, (struct ext4_encryption_policy __user *)arg, sizeof(policy))) { @@ -629,7 +632,17 @@ resizefs_out: goto encryption_policy_out; } + err = mnt_want_write_file(filp); + if (err) + goto encryption_policy_out; + + mutex_lock(&inode->i_mutex); + err = ext4_process_policy(&policy, inode); + + mutex_unlock(&inode->i_mutex); + + mnt_drop_write_file(filp); encryption_policy_out: return err; #else diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 61eaf74dca37..84cd77663e1f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); @@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore) +static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; int blocksize; @@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; - bh = kzalloc(i, GFP_NOFS); + bh = kzalloc(i, gfp); if (bh == NULL) { err = -ENOMEM; goto out; @@ -983,7 +983,7 @@ out: * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, - ext4_group_t group, struct ext4_buddy *e4b) + ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; @@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block++; pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) * calling this routine! */ static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; @@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group @@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } /* init buddy cache */ page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1109,8 +1109,8 @@ err: * calling this routine! */ static noinline_for_stack int -ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; @@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * we need full data about the group * to make a good selection */ - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } @@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * wait for it to initialize. */ page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { unlock_page(page); goto err; @@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + gfp); if (ret) { unlock_page(page); goto err; @@ -1247,6 +1248,12 @@ err: return ret; } +static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); +} + static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) @@ -1259,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1; + int bb_incr = 1 << (e4b->bd_blkbits - 1); void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); @@ -1271,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) /* this block is part of buddy of order 'order' */ return order; } - bb += 1 << (e4b->bd_blkbits - order); + bb += bb_incr; + bb_incr >>= 1; order++; } return 0; @@ -2045,7 +2054,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group); + int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); if (ret) return ret; } @@ -2278,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; @@ -2576,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; - unsigned offset; + unsigned offset, offset_incr; unsigned max; int ret; @@ -2605,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb) i = 1; offset = 0; + offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; - offset += 1 << (sb->s_blocksize_bits - i); + offset += offset_incr; + offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i <= sb->s_blocksize_bits + 1); @@ -2928,7 +2939,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error - * Fix the bitmap and repeat the block allocation + * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); @@ -2937,7 +2948,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) - err = -EAGAIN; + err = -EFSCORRUPTED; goto out_err; } @@ -3109,6 +3120,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; + /* + * Trim allocation request for filesystems with artificially small + * groups. + */ + if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); + end = start + size; /* check we don't cross already preallocated blocks */ @@ -4502,18 +4520,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); - if (*errp == -EAGAIN) { - /* - * drop the reference that we took - * in ext4_mb_use_best_found - */ - ext4_mb_release_context(ac); - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - goto repeat; - } else if (*errp) { + if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { @@ -4815,7 +4822,9 @@ do_more: #endif trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - err = ext4_mb_load_buddy(sb, block_group, &e4b); + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) goto error_return; @@ -5217,7 +5226,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) grp = ext4_get_group_info(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e032a0423e35..05048fcfd602 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); - down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); + down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } @@ -187,7 +187,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (PageUptodate(page)) return 0; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -390,6 +390,7 @@ data_copy: *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) break; + bh = bh->b_this_page; } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); @@ -483,6 +484,13 @@ mext_check_arguments(struct inode *orig_inode, return -EBUSY; } + if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be quota files [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EBUSY; + } + /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " @@ -590,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (ext4_encrypted_inode(orig_inode) || + ext4_encrypted_inode(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a969ab39f302..1d007e853f5c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - __le32 save_csum; int size; + __u32 dummy_csum = 0; + int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - save_csum = t->dt_checksum; - t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = save_csum; + csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } @@ -1244,9 +1243,9 @@ static inline int ext4_match(struct ext4_filename *fname, if (unlikely(!name)) { if (fname->usr_fname->name[0] == '_') { int ret; - if (de->name_len < 16) + if (de->name_len <= 32) return 0; - ret = memcmp(de->name + de->name_len - 16, + ret = memcmp(de->name + ((de->name_len - 17) & ~15), fname->crypto_buf.name + 8, 16); return (ret == 0) ? 1 : 0; } @@ -1558,6 +1557,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; + if (ext4_encrypted_inode(dir)) { + int res = ext4_get_encryption_info(dir); + + /* + * This should be a properly defined flag for + * dentry->d_flags when we uplift this to the VFS. + * d_fsdata is set to (void *) 1 if if the dentry is + * created while the directory was encrypted and we + * don't have access to the key. + */ + dentry->d_fsdata = NULL; + if (ext4_encryption_info(dir)) + dentry->d_fsdata = (void *) 1; + d_set_d_op(dentry, &ext4_encrypted_d_ops); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } + if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -2018,33 +2035,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, frame->entries = entries; frame->at = entries; frame->bh = bh; - bh = bh2; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; - retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &fname->hinfo); + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } - dx_release(frames); - retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); - brelse(bh); - return retval; + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ - ext4_mark_inode_dirty(handle, dir); + if (retval) + ext4_mark_inode_dirty(handle, dir); dx_release(frames); + brelse(bh2); return retval; } @@ -2809,7 +2824,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); - list_del(&EXT4_I(inode)->i_orphan); + list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 17fbe3882b8e..6ca56f5f72b5 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -485,9 +486,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && nr_to_submit) { - data_page = ext4_encrypt(inode, page); + gfp_t gfp_flags = GFP_NOFS; + + retry_encrypt: + data_page = ext4_encrypt(inode, page, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); + if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { + if (io->io_bio) { + ext4_io_submit(io); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } data_page = NULL; goto out; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5dc5e95063de..bc7642f57dc8 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -279,7 +279,7 @@ int ext4_mpage_readpages(struct address_space *mapping, if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = ext4_get_crypto_ctx(inode); + ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 34038e3598d5..74516efd874c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1926,7 +1926,8 @@ retry: n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); - n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb); n_group--; /* set to last group number */ } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6e5a..68345a9e59b8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -793,6 +793,7 @@ static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + int aborted = 0; int i, err; ext4_unregister_li_request(sb); @@ -802,9 +803,10 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); if (sbi->s_journal) { + aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if (err < 0) + if ((err < 0) && !aborted) ext4_abort(sb, "Couldn't clean up the journal"); } @@ -816,7 +818,7 @@ static void ext4_put_super(struct super_block *sb) ext4_ext_release(sb); ext4_xattr_put_super(sb); - if (!(sb->s_flags & MS_RDONLY)) { + if (!(sb->s_flags & MS_RDONLY) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } @@ -958,6 +960,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -1292,9 +1295,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) return -1; } if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " - "when QUOTA feature is enabled"); - return -1; + ext4_msg(sb, KERN_INFO, "Journaled quota options " + "ignored when QUOTA feature is enabled"); + return 1; } qname = match_strdup(args); if (!qname) { @@ -1657,10 +1660,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } if (ext4_has_feature_quota(sb)) { - ext4_msg(sb, KERN_ERR, - "Cannot set journaled quota options " + ext4_msg(sb, KERN_INFO, + "Quota format mount options ignored " "when QUOTA feature is enabled"); - return -1; + return 1; } sbi->s_jquota_fmt = m->mount_opt; #endif @@ -1721,11 +1724,11 @@ static int parse_options(char *options, struct super_block *sb, #ifdef CONFIG_QUOTA if (ext4_has_feature_quota(sb) && (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " - "feature is enabled"); - return 0; - } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " + "mount options ignored."); + clear_opt(sb, USRQUOTA); + clear_opt(sb, GRPQUOTA); + } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -2029,23 +2032,25 @@ failed: static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - int offset; + int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ - __le16 save_csum; __u32 csum32; + __u16 dummy_csum = 0; - save_csum = gdp->bg_checksum; - gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, - sbi->s_desc_size); - gdp->bg_checksum = save_csum; + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + if (offset < sbi->s_desc_size) + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; @@ -2055,8 +2060,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, if (!ext4_has_feature_gdt_csum(sb)) return 0; - offset = offsetof(struct ext4_group_desc, bg_checksum); - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); @@ -2092,6 +2095,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, + ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2122,6 +2126,11 @@ static int ext4_check_descriptors(struct super_block *sb, grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u overlaps " + "superblock", i); + } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " @@ -2129,6 +2138,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u overlaps " + "superblock", i); + } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " @@ -2136,6 +2150,11 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } inode_table = ext4_inode_table(sb, gdp); + if (inode_table == sb_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u overlaps " + "superblock", i); + } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2239,6 +2258,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, while (es->s_last_orphan) { struct inode *inode; + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; @@ -3010,10 +3039,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; @@ -3103,7 +3137,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -3122,16 +3156,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; + if ((data && !orig_data) || !sbi) + goto out_free_base; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } + if (!sbi->s_blockgroup_lock) + goto out_free_base; + sb->s_fs_info = sbi; sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; @@ -3277,11 +3309,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); + if (sbi->s_es->s_mount_opts[0]) { + char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, + &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + } + kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, @@ -3307,6 +3347,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dax"); goto failed_mount; } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { @@ -3367,7 +3412,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d", blocksize); + "Unsupported filesystem blocksize %d (%d log_block_size)", + blocksize, le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); goto failed_mount; } @@ -3454,12 +3514,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3499,6 +3563,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "block size (%d)", clustersize, blocksize); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); sbi->s_clusters_per_group = @@ -3535,13 +3606,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); @@ -3601,6 +3665,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) > db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + goto failed_mount; + } + } sbi->s_group_desc = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); @@ -3622,7 +3695,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ret = -EFSCORRUPTED; goto failed_mount2; @@ -3675,7 +3748,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - if (ext4_load_journal(sb, es, journal_devnum)) + err = ext4_load_journal(sb, es, journal_devnum); + if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && ext4_has_feature_journal_needs_recovery(sb)) { @@ -3945,7 +4019,9 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + "Opts: %.*s%s%s", descr, + (int) sizeof(sbi->s_es->s_mount_opts), + sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) @@ -4015,8 +4091,8 @@ failed_mount: out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); +out_free_base: kfree(sbi); -out_free_orig: kfree(orig_data); return err ? err : ret; } @@ -4936,6 +5012,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) EXT4_SB(sb)->s_jquota_fmt, type); } +static void lockdep_set_quota_inode(struct inode *inode, int subclass) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + /* The first argument of lockdep_set_subclass has to be + * *exactly* the same as the argument to init_rwsem() --- in + * this case, in init_once() --- or lockdep gets unhappy + * because the name of the lock is set using the + * stringification of the argument to init_rwsem(). + */ + (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ + lockdep_set_subclass(&ei->i_data_sem, subclass); +} + /* * Standard function to be called on quota_on */ @@ -4975,8 +5065,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (err) return err; } - - return dquot_quota_on(sb, type, format_id, path); + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); + err = dquot_quota_on(sb, type, format_id, path); + if (err) + lockdep_set_quota_inode(path->dentry->d_inode, + I_DATA_SEM_NORMAL); + return err; } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, @@ -5002,8 +5096,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); iput(qf_inode); + if (err) + lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); return err; } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1420a3c614af..c2ee23acf359 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -100,7 +100,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (!ret || val >= clusters) + if (ret || val >= clusters) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); @@ -223,14 +223,18 @@ static struct attribute *ext4_attrs[] = { EXT4_ATTR_FEATURE(lazy_itable_init); EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); +#ifdef CONFIG_EXT4_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +#endif EXT4_ATTR_FEATURE(metadata_csum_seed); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), +#ifdef CONFIG_EXT4_FS_ENCRYPTION ATTR_LIST(encryption), +#endif ATTR_LIST(metadata_csum_seed), NULL, }; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6b6b3e751f8c..7c23363ecf19 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -123,17 +123,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - __le32 save_csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); + __u32 dummy_csum = 0; + int offset = offsetof(struct ext4_xattr_header, h_checksum); - save_csum = hdr->h_checksum; - hdr->h_checksum = 0; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, - EXT4_BLOCK_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + EXT4_BLOCK_SIZE(inode->i_sb) - offset); - hdr->h_checksum = save_csum; return cpu_to_le32(csum); } @@ -232,6 +233,27 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return error; } +static int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line) +{ + struct ext4_xattr_entry *entry = IFIRST(header); + int error = -EFSCORRUPTED; + + if (((void *) header >= end) || + (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC))) + goto errout; + error = ext4_xattr_check_names(entry, end, entry); +errout: + if (error) + __ext4_error_inode(inode, function, line, 0, + "corrupted in-inode xattr"); + return error; +} + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + static inline int ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) { @@ -343,7 +365,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(entry, end, entry); + error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_find_entry(&entry, name_index, name, @@ -474,7 +496,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), @@ -990,8 +1012,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_xattr_check_names(IFIRST(header), is->s.end, - IFIRST(header)); + error = xattr_check_inode(inode, header, is->s.end); if (error) return error; /* Find the named attribute. */ @@ -1264,15 +1285,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, size_t min_offs, free; int total_ino; void *base, *start, *end; - int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + int isize_diff; /* How much do we need to grow i_extra_isize */ down_write(&EXT4_I(inode)->xattr_sem); + /* + * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty + */ + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); retry: - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - } + isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + goto out; header = IHDR(inode, raw_inode); entry = IFIRST(header); @@ -1288,8 +1313,12 @@ retry: last = entry; total_ino = sizeof(struct ext4_xattr_ibody_header); + error = xattr_check_inode(inode, header, end); + if (error) + goto cleanup; + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= new_extra_isize) { + if (free >= isize_diff) { entry = IFIRST(header); ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + @@ -1297,8 +1326,7 @@ retry: (void *)header, total_ino, inode->i_sb->s_blocksize); EXT4_I(inode)->i_extra_isize = new_extra_isize; - error = 0; - goto cleanup; + goto out; } /* @@ -1321,7 +1349,7 @@ retry: end = bh->b_data + bh->b_size; min_offs = end - base; free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < new_extra_isize) { + if (free < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1335,7 +1363,7 @@ retry: free = inode->i_sb->s_blocksize; } - while (new_extra_isize > 0) { + while (isize_diff > 0) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1366,7 +1394,7 @@ retry: EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); if (total_size <= free && total_size < min_total_size) { - if (total_size < new_extra_isize) { + if (total_size < isize_diff) { small_entry = last; } else { entry = last; @@ -1421,22 +1449,22 @@ retry: error = ext4_xattr_ibody_set(handle, inode, &i, is); if (error) goto cleanup; + total_ino -= entry_size; entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) - shift_bytes = new_extra_isize; + if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) + shift_bytes = isize_diff; else - shift_bytes = entry_size + size; + shift_bytes = entry_size + EXT4_XATTR_SIZE(size); /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - shift_bytes, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, - (void *)header, total_ino - entry_size, - inode->i_sb->s_blocksize); + ext4_xattr_shift_entries(entry, -shift_bytes, + (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize + shift_bytes, + (void *)header, total_ino, inode->i_sb->s_blocksize); - extra_isize += shift_bytes; - new_extra_isize -= shift_bytes; - EXT4_I(inode)->i_extra_isize = extra_isize; + isize_diff -= shift_bytes; + EXT4_I(inode)->i_extra_isize += shift_bytes; + header = IHDR(inode, raw_inode); i.name = b_entry_name; i.value = buffer; @@ -1458,6 +1486,8 @@ retry: kfree(bs); } brelse(bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return 0; @@ -1469,6 +1499,10 @@ cleanup: kfree(is); kfree(bs); brelse(bh); + /* + * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode + * size expansion failed. + */ up_write(&EXT4_I(inode)->xattr_sem); return error; } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7241f0..83dcf7bfd7b8 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -213,13 +213,11 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + if (acl && !ipage) { + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(fi, inode->i_mode); - if (error == 0) - acl = NULL; } break; diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c index ab377d496a39..38349ed5ea51 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/f2fs/crypto_fname.c @@ -333,7 +333,7 @@ int f2fs_fname_disk_to_usr(struct inode *inode, memset(buf + 4, 0, 4); } else memset(buf, 0, 8); - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name + 1); oname->len = ret + 1; diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 5de2d866a25c..18595d7a0efc 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -92,7 +92,6 @@ static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_ablkcipher(ci->ci_ctfm); kmem_cache_free(f2fs_crypt_info_cachep, ci); } @@ -113,7 +112,7 @@ void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) f2fs_free_crypt_info(ci); } -int _f2fs_get_encryption_info(struct inode *inode) +int f2fs_get_encryption_info(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_crypt_info *crypt_info; @@ -129,18 +128,12 @@ int _f2fs_get_encryption_info(struct inode *inode) char mode; int res; + if (fi->i_crypt_info) + return 0; + res = f2fs_crypto_initialize(); if (res) return res; -retry: - crypt_info = ACCESS_ONCE(fi->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - f2fs_free_encryption_info(inode, crypt_info); - goto retry; - } res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, @@ -159,7 +152,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) @@ -197,7 +189,6 @@ retry: keyring_key = NULL; goto out; } - crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { @@ -230,17 +221,12 @@ retry: if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { - f2fs_free_crypt_info(crypt_info); - goto retry; - } - return 0; - + if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY && !S_ISREG(inode->i_mode)) res = 0; - + key_put(keyring_key); f2fs_free_crypt_info(crypt_info); memzero_explicit(raw_key, sizeof(raw_key)); return res; diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c index d4a96af513c2..884f3f0fe29d 100644 --- a/fs/f2fs/crypto_policy.c +++ b/fs/f2fs/crypto_policy.c @@ -89,6 +89,9 @@ static int f2fs_create_encryption_context_from_policy( int f2fs_process_policy(const struct f2fs_encryption_policy *policy, struct inode *inode) { + if (!inode_owner_or_capable(inode)) + return -EACCES; + if (policy->version != 0) return -EINVAL; @@ -138,20 +141,38 @@ int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) int f2fs_is_child_context_consistent_with_parent(struct inode *parent, struct inode *child) { - struct f2fs_crypt_info *parent_ci, *child_ci; + const struct f2fs_crypt_info *parent_ci, *child_ci; + struct f2fs_encryption_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - pr_err("parent %p child %p\n", parent, child); - BUG_ON(1); - } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; - /* no restrictions if the parent directory is not encrypted */ + /* No restrictions if the parent directory is unencrypted */ if (!f2fs_encrypted_inode(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ + + /* Encrypted directories must not contain unencrypted files */ if (!f2fs_encrypted_inode(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = f2fs_get_encryption_info(parent); if (res) return 0; @@ -160,17 +181,35 @@ int f2fs_is_child_context_consistent_with_parent(struct inode *parent, return 0; parent_ci = F2FS_I(parent)->i_crypt_info; child_ci = F2FS_I(child)->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + F2FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = f2fs_getxattr(parent, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + &parent_ctx, sizeof(parent_ctx), NULL); + if (res != sizeof(parent_ctx)) + return 0; + + res = f2fs_getxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + &child_ctx, sizeof(child_ctx), NULL); + if (res != sizeof(child_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } /** diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 478e5d54154f..24d6a51b48d1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -352,6 +352,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c1678ba8f92..60972a559685 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -124,19 +124,29 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, de = &d->dentry[bit_pos]; - /* encrypted case */ + if (de->hash_code != namehash) + goto not_match; + de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == fname->hash) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (unlikely(!name->name)) { + if (fname->usr_fname->name[0] == '_') { + if (de_name.len > 32 && + !memcmp(de_name.name + ((de_name.len - 17) & ~15), + fname->crypto_buf.name + 8, 16)) + goto found; + goto not_match; + } + name->name = fname->crypto_buf.name; + name->len = fname->crypto_buf.len; + } +#endif + if (de_name.len == name->len && + !memcmp(de_name.name, name->name, name->len)) goto found; - +not_match: if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; @@ -170,7 +180,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); @@ -547,7 +557,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, level = 0; slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + dentry_hash = f2fs_dentry_hash(&new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9db5500d63d9..2871576fbca4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1722,7 +1722,8 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct f2fs_filename *fname); /* * node.c @@ -2149,7 +2150,6 @@ void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); /* crypto_key.c */ void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); -int _f2fs_get_encryption_info(struct inode *inode); /* crypto_fname.c */ bool f2fs_valid_filenames_enc_mode(uint32_t); @@ -2170,18 +2170,7 @@ void f2fs_exit_crypto(void); int f2fs_has_encryption_key(struct inode *); -static inline int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _f2fs_get_encryption_info(inode); - return 0; -} +int f2fs_get_encryption_info(struct inode *inode); void f2fs_fname_crypto_free_buffer(struct f2fs_str *); int f2fs_fname_setup_filename(struct inode *, const struct qstr *, diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h index c2c1c2b63b25..f113f1a1e8c1 100644 --- a/fs/f2fs/f2fs_crypto.h +++ b/fs/f2fs/f2fs_crypto.h @@ -79,7 +79,6 @@ struct f2fs_crypt_info { char ci_filename_mode; char ci_flags; struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a197215ad52b..4b449d263333 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1535,12 +1535,19 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) #ifdef CONFIG_F2FS_FS_ENCRYPTION struct f2fs_encryption_policy policy; struct inode *inode = file_inode(filp); + int err; if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, sizeof(policy))) return -EFAULT; - return f2fs_process_policy(&policy, inode); + mutex_lock(&inode->i_mutex); + + err = f2fs_process_policy(&policy, inode); + + mutex_unlock(&inode->i_mutex); + + return err; #else return -EOPNOTSUPP; #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..b238d2fec3e5 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct f2fs_filename *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..ad80f916b64d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -303,7 +303,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, if (IS_ERR(ipage)) return NULL; - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); inline_dentry = inline_data_addr(ipage); @@ -468,7 +468,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, f2fs_wait_on_page_writeback(ipage, NODE); - name_hash = f2fs_dentry_hash(name); + name_hash = f2fs_dentry_hash(name, NULL); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..4f666368aa85 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -918,6 +918,79 @@ static loff_t max_file_size(unsigned bits) return result; } +static inline bool sanity_check_area_boundary(struct super_block *sb, + struct f2fs_super_block *raw_super) +{ + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != + segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + main_blkaddr, + segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } + + return false; +} + static int sanity_check_raw_super(struct super_block *sb, struct f2fs_super_block *raw_super) { @@ -947,6 +1020,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -965,6 +1046,30 @@ static int sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sb, raw_super)) + return 1; + return 0; } @@ -973,6 +1078,8 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -984,6 +1091,20 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3698..cf644d52c0cf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1269,6 +1269,16 @@ out: return 0; } +static void fat_dummy_inode_init(struct inode *inode) +{ + /* Initialize this dummy inode to work as no-op. */ + MSDOS_I(inode)->mmu_private = 0; + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->i_attrs = 0; + MSDOS_I(inode)->i_pos = 0; +} + static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); @@ -1713,12 +1723,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; - MSDOS_I(fat_inode)->i_pos = 0; + fat_dummy_inode_init(fat_inode); sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; + fat_dummy_inode_init(fsinfo_inode); fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); diff --git a/fs/fcntl.c b/fs/fcntl.c index ee85cd4e136a..62376451bbce 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -740,16 +740,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( - O_RDONLY | O_WRONLY | O_RDWR | - O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | /* O_NONBLOCK | */ - __O_SYNC | O_DSYNC | FASYNC | - O_DIRECT | O_LARGEFILE | O_DIRECTORY | - O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE | - __FMODE_NONOTIFY - )); + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + HWEIGHT32( + (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | + __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); diff --git a/fs/fhandle.c b/fs/fhandle.c index d59712dfa3e7..ca3c3dd01789 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag); + file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7a8ea1351584..60d6fc2e0e4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode) wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); - wb_put(wb); /* not gonna deref it anymore */ /* i_wb may have changed inbetween, can't use inode_to_wb() */ - if (likely(wb == inode->i_wb)) - return wb; /* @inode already has ref */ + if (likely(wb == inode->i_wb)) { + wb_put(wb); /* @inode already has ref */ + return wb; + } spin_unlock(&wb->list_lock); + wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } @@ -1339,10 +1341,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() * and does more profound writeback list handling in writeback_sb_inodes(). */ -static int -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_single_inode(struct inode *inode, + struct writeback_control *wbc) { + struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); @@ -1380,7 +1382,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); - spin_lock(&wb->list_lock); + + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If inode is clean, remove it from writeback lists. Otherwise don't @@ -1455,6 +1458,7 @@ static long writeback_sb_inodes(struct super_block *sb, while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); + struct bdi_writeback *tmp_wb; if (inode->i_sb != sb) { if (work->sb) { @@ -1545,15 +1549,23 @@ static long writeback_sb_inodes(struct super_block *sb, cond_resched(); } - - spin_lock(&wb->list_lock); + /* + * Requeue @inode if still dirty. Be careful as @inode may + * have been switched to another wb in the meantime. + */ + tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) wrote++; - requeue_inode(inode, wb, &wbc); + requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); + if (unlikely(tmp_wb != wb)) { + spin_unlock(&tmp_wb->list_lock); + spin_lock(&wb->list_lock); + } + /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -2340,7 +2352,6 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, @@ -2352,7 +2363,7 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - return writeback_single_inode(inode, wb, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); @@ -2369,7 +2380,7 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { - return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); + return writeback_single_inode(inode, wbc); } EXPORT_SYMBOL(sync_inode); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 4304072161aa..40d61077bead 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -542,6 +542,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (invalidate) set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } } else { @@ -560,6 +561,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); + /* Make sure any pending writes are cancelled. */ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + fscache_invalidate_writes(cookie); + /* Reset the cookie state if it wasn't relinquished */ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { atomic_inc(&cookie->n_active); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 9b28649df3a1..a8aa00be4444 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -48,6 +48,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); /* check the netfs type is not already present */ diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9e792e30f4db..7a182c87f378 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object static const struct fscache_state *fscache_object_available(struct fscache_object *, int); static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); static const struct fscache_state *fscache_update_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); #define __STATE_NAME(n) fscache_osm_##n #define STATE(n) (&__STATE_NAME(n)) @@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); +static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); static WAIT_STATE(WAIT_FOR_INIT, "?INI", TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); @@ -229,6 +230,10 @@ execute_work_state: event = -1; if (new_state == NO_TRANSIT) { _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + if (unlikely(state == STATE(OBJECT_DEAD))) { + _leave(" [dead]"); + return; + } fscache_enqueue_object(object); event_mask = object->oob_event_mask; goto unmask_events; @@ -239,7 +244,7 @@ execute_work_state: object->state = state = new_state; if (state->work) { - if (unlikely(state->work == ((void *)2UL))) { + if (unlikely(state == STATE(OBJECT_DEAD))) { _leave(" [dead]"); return; } @@ -645,6 +650,12 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob fscache_mark_object_dead(object); object->oob_event_mask = 0; + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) { + /* Reject any new read/write ops and abort any that are pending. */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + fscache_cancel_all_ops(object); + } + if (list_empty(&object->dependents) && object->n_ops == 0 && object->n_children == 0) @@ -1077,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object, } } EXPORT_SYMBOL(fscache_object_mark_killed); + +/* + * The object is dead. We can get here if an object gets queued by an event + * that would lead to its death (such as EV_KILL) when the dispatcher is + * already running (and so can be requeued) but hasn't yet cleared the event + * mask. + */ +static const struct fscache_state *fscache_object_dead(struct fscache_object *object, + int event) +{ + if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, + &object->flags)) + return NO_TRANSIT; + + WARN(true, "FS-Cache object redispatched after death"); + return NO_TRANSIT; +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 8e3ee1936c7e..c5b6b7165489 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt) static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) { - struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); loff_t pos = 0; return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); @@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) { - struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); loff_t pos = 0; /* * No locking or generic_write_checks(), the server is diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ebb5e37455a0..d0cf1f010fbe 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -418,6 +418,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); + if (test_bit(FR_FINISHED, &req->flags)) { + spin_unlock(&fiq->waitq.lock); + return; + } if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); wake_up_locked(&fiq->waitq); @@ -2083,7 +2087,6 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - clear_bit(FR_PENDING, &req->flags); clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); request_end(fc, req); @@ -2161,6 +2164,8 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_lock(&fiq->waitq.lock); fiq->connected = 0; list_splice_init(&fiq->pending, &to_end2); + list_for_each_entry(req, &to_end2, list) + clear_bit(FR_PENDING, &req->flags); while (forget_pending(fiq)) kfree(dequeue_forget(fiq, 1, NULL)); wake_up_all_locked(&fiq->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e08712d3b..4b5f2c4e69c8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1697,14 +1697,46 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); + struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; + int ret; if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; - if (attr->ia_valid & ATTR_FILE) - return fuse_do_setattr(inode, attr, attr->ia_file); - else - return fuse_do_setattr(inode, attr, NULL); + if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { + int kill; + + attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | + ATTR_MODE); + /* + * ia_mode calculation may have used stale i_mode. Refresh and + * recalculate. + */ + ret = fuse_do_getattr(inode, NULL, file); + if (ret) + return ret; + + attr->ia_mode = inode->i_mode; + kill = should_remove_suid(entry); + if (kill & ATTR_KILL_SUID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISUID; + } + if (kill & ATTR_KILL_SGID) { + attr->ia_valid |= ATTR_MODE; + attr->ia_mode &= ~S_ISGID; + } + } + if (!attr->ia_valid) + return 0; + + ret = fuse_do_setattr(inode, attr, file); + if (!ret) { + /* Directory mode changed, may need to revalidate access */ + if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) + fuse_invalidate_entry_cache(entry); + } + return ret; } static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, @@ -1797,6 +1829,23 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, return ret; } +static int fuse_verify_xattr_list(char *list, size_t size) +{ + size_t origsize = size; + + while (size) { + size_t thislen = strnlen(list, size); + + if (!thislen || thislen == size) + return -EIO; + + size -= thislen + 1; + list += thislen + 1; + } + + return origsize; +} + static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); @@ -1832,6 +1881,8 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) ret = fuse_simple_request(fc, &args); if (!ret && !size) ret = outarg.size; + if (ret > 0 && size) + ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { fc->no_listxattr = 1; ret = -EOPNOTSUPP; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 570ca4053c80..1a063cbfe503 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -46,7 +46,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); if (unlikely(!ff)) return NULL; @@ -100,6 +100,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { + __set_bit(FR_FORCE, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); @@ -417,6 +418,15 @@ static int fuse_flush(struct file *file, fl_owner_t id) fuse_sync_writes(inode); mutex_unlock(&inode->i_mutex); + if (test_bit(AS_ENOSPC, &file->f_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags)) + err = -ENOSPC; + if (test_bit(AS_EIO, &file->f_mapping->flags) && + test_and_clear_bit(AS_EIO, &file->f_mapping->flags)) + err = -EIO; + if (err) + return err; + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -462,6 +472,21 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, goto out; fuse_sync_writes(inode); + + /* + * Due to implementation of fuse writeback + * filemap_write_and_wait_range() does not catch errors. + * We have to do this directly after fuse_sync_writes() + */ + if (test_bit(AS_ENOSPC, &file->f_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags)) + err = -ENOSPC; + if (test_bit(AS_EIO, &file->f_mapping->flags) && + test_and_clear_bit(AS_EIO, &file->f_mapping->flags)) + err = -EIO; + if (err) + goto out; + err = sync_inode_metadata(inode, 1); if (err) goto out; @@ -516,18 +541,23 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, int write) +static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; - if (write) + if (should_dirty) set_page_dirty_lock(page); put_page(page); } } +static void fuse_io_release(struct kref *kref) +{ + kfree(container_of(kref, struct fuse_io_priv, refcnt)); +} + static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) { if (io->err) @@ -585,8 +615,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) } io->iocb->ki_complete(io->iocb, res, 0); - kfree(io); } + + kref_put(&io->refcnt, fuse_io_release); } static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) @@ -613,6 +644,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, size_t num_bytes, struct fuse_io_priv *io) { spin_lock(&io->lock); + kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); @@ -691,7 +723,7 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, static int fuse_do_readpage(struct file *file, struct page *page) { - struct fuse_io_priv io = { .async = 0, .file = file }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -984,7 +1016,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, size_t res; unsigned offset; unsigned i; - struct fuse_io_priv io = { .async = 0, .file = file }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); @@ -1300,6 +1332,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; + bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1344,7 +1377,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, !write); + fuse_release_user_pages(req, should_dirty); if (req->out.h.error) { if (!res) res = req->out.h.error; @@ -1398,7 +1431,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp); return __fuse_direct_read(&io, to, &iocb->ki_pos); } @@ -1406,7 +1439,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct fuse_io_priv io = { .async = 0, .file = file }; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); ssize_t res; if (is_bad_inode(inode)) @@ -1965,6 +1998,10 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, { struct inode *inode = page->mapping->host; + /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ + if (!copied) + goto unlock; + if (!PageUptodate(page)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK; @@ -1975,6 +2012,8 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, fuse_write_update_size(inode, pos + copied); set_page_dirty(page); + +unlock: unlock_page(page); page_cache_release(page); @@ -2786,6 +2825,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) loff_t i_size; size_t count = iov_iter_count(iter); struct fuse_io_priv *io; + bool is_sync = is_sync_kiocb(iocb); pos = offset; inode = file->f_mapping->host; @@ -2806,6 +2846,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) if (!io) return -ENOMEM; spin_lock_init(&io->lock); + kref_init(&io->refcnt); io->reqs = 1; io->bytes = -1; io->size = 0; @@ -2825,12 +2866,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size) && + if (!is_sync && (offset + count > i_size) && iov_iter_rw(iter) == WRITE) io->async = false; - if (io->async && is_sync_kiocb(iocb)) + if (io->async && is_sync) { + /* + * Additional reference to keep io around after + * calling fuse_aio_complete() + */ + kref_get(&io->refcnt); io->done = &wait; + } if (iov_iter_rw(iter) == WRITE) { ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); @@ -2843,14 +2890,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (!is_sync_kiocb(iocb)) + if (!is_sync) return -EIOCBQUEUED; wait_for_completion(&wait); ret = fuse_get_res_by_io(io); } - kfree(io); + kref_put(&io->refcnt, fuse_io_release); if (iov_iter_rw(iter) == WRITE) { if (ret > 0) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 405113101db8..604cd42dafef 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -22,6 +22,7 @@ #include <linux/rbtree.h> #include <linux/poll.h> #include <linux/workqueue.h> +#include <linux/kref.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -243,6 +244,7 @@ struct fuse_args { /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { + struct kref refcnt; int async; spinlock_t lock; unsigned reqs; @@ -256,6 +258,13 @@ struct fuse_io_priv { struct completion *done; }; +#define FUSE_IO_PRIV_SYNC(f) \ +{ \ + .refcnt = { ATOMIC_INIT(1) }, \ + .async = 0, \ + .file = f, \ +} + /** * Request flags * diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..0d5e8e59b390 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -926,7 +926,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; req->in.h.opcode = FUSE_INIT; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be3b061c05c..ff0ac96a8e7b 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -79,17 +79,11 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - - if (error == 0) - acl = NULL; - - if (mode != inode->i_mode) { - inode->i_mode = mode; + if (mode != inode->i_mode) mark_inode_dirty(inode); - } } if (acl) { diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index ad8a5b757cc7..a443c6e54412 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -760,7 +760,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!error) + if (!IS_ERR_VALUE(error)) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (error) + if (IS_ERR_VALUE(error)) return error; /* Get the old leaf block */ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 32e74710b1aa..070901e76653 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -80,9 +80,9 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; -void gfs2_glock_free(struct gfs2_glock *gl) +static void gfs2_glock_dealloc(struct rcu_head *rcu) { - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); if (gl->gl_ops->go_flags & GLOF_ASPACE) { kmem_cache_free(gfs2_glock_aspace_cachep, gl); @@ -90,6 +90,13 @@ void gfs2_glock_free(struct gfs2_glock *gl) kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(gfs2_glock_cachep, gl); } +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); } @@ -651,9 +658,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret, tries = 0; + rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (gl && !lockref_get_not_dead(&gl->gl_lockref)) gl = NULL; + rcu_read_unlock(); *glp = gl; if (gl) @@ -721,15 +730,18 @@ again: if (ret == -EEXIST) { ret = 0; + rcu_read_lock(); tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { if (++tries < 100) { + rcu_read_unlock(); cond_resched(); goto again; } tmp = NULL; ret = -ENOMEM; } + rcu_read_unlock(); } else { WARN_ON_ONCE(ret); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index de7b4f97ac75..4a9077ec9313 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,7 +207,7 @@ struct lm_lockname { struct gfs2_sbd *ln_sbd; u64 ln_number; unsigned int ln_type; -}; +} __packed __aligned(sizeof(int)); #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ @@ -367,6 +367,7 @@ struct gfs2_glock { loff_t end; } gl_vm; }; + struct rcu_head gl_rcu; struct rhash_head gl_node; }; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index df0c9af68d05..71b3087b7e32 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -68,8 +68,8 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, case ACL_TYPE_ACCESS: xattr_name = POSIX_ACL_XATTR_ACCESS; if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - if (err < 0) + err = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (err) return err; } err = 0; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5a7b3229b956..f34d6f5a5aca 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) if (S_ISLNK(root_inode->i_mode)) { char *name = follow_link(host_root_path); - if (IS_ERR(name)) + if (IS_ERR(name)) { err = PTR_ERR(name); - else - err = read_name(root_inode, name); + goto out_put; + } + err = read_name(root_inode, name); kfree(name); if (err) goto out_put; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a561591896bd..3713fd52b44b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/bitmap.h> #include <linux/slab.h> +#include <linux/seq_file.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ @@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) int lowercase, eas, chk, errs, chkdsk, timeshift; int o; struct hpfs_sb_info *sbi = hpfs_sb(s); - char *new_opts = kstrdup(data, GFP_KERNEL); - - if (!new_opts) - return -ENOMEM; sync_filesystem(s); @@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); - replace_mount_options(s, new_opts); - hpfs_unlock(s); return 0; out_err: hpfs_unlock(s); - kfree(new_opts); return -EINVAL; } +static int hpfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb); + + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid)); + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid)); + seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777)); + if (sbi->sb_lowercase) + seq_printf(seq, ",case=lower"); + if (!sbi->sb_chk) + seq_printf(seq, ",check=none"); + if (sbi->sb_chk == 2) + seq_printf(seq, ",check=strict"); + if (!sbi->sb_err) + seq_printf(seq, ",errors=continue"); + if (sbi->sb_err == 2) + seq_printf(seq, ",errors=panic"); + if (!sbi->sb_chkdsk) + seq_printf(seq, ",chkdsk=no"); + if (sbi->sb_chkdsk == 2) + seq_printf(seq, ",chkdsk=always"); + if (!sbi->sb_eas) + seq_printf(seq, ",eas=no"); + if (sbi->sb_eas == 1) + seq_printf(seq, ",eas=ro"); + if (sbi->sb_timeshift) + seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift); + return 0; +} + /* Super operations */ static const struct super_operations hpfs_sops = @@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops = .put_super = hpfs_put_super, .statfs = hpfs_statfs, .remount_fs = hpfs_remount_fs, - .show_options = generic_show_options, + .show_options = hpfs_show_options, }; static int hpfs_fill_super(struct super_block *s, void *options, int silent) @@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - save_mount_options(s, options); - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 595ebdb41846..a17da8b57fc6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -191,7 +191,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/inode.c b/fs/inode.c index 1be5f9003eb3..b0edef500590 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1733,8 +1733,8 @@ static int __remove_privs(struct dentry *dentry, int kill) */ int file_remove_privs(struct file *file) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = d_inode(dentry); + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); int kill; int error = 0; @@ -1742,7 +1742,7 @@ int file_remove_privs(struct file *file) if (IS_NOSEC(inode)) return 0; - kill = file_needs_remove_privs(file); + kill = dentry_needs_remove_privs(dentry); if (kill < 0) return kill; if (kill) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d67a16f2a45d..350f67fb5b9c 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -690,6 +690,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) pri_bh = NULL; root_found: + /* We don't support read-write mounts */ + if (!(s->s_flags & MS_RDONLY)) { + error = -EACCES; + goto out_freebh; + } if (joliet_level && (pri == NULL || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. @@ -1503,9 +1508,6 @@ struct inode *__isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - /* We don't support read-write mounts */ - if (!(flags & MS_RDONLY)) - return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 735d7522a3a9..204659a5f6db 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -203,6 +203,8 @@ int get_rock_ridge_filename(struct iso_directory_record *de, int retnamlen = 0; int truncate = 0; int ret = 0; + char *p; + int len; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; @@ -267,12 +269,17 @@ repeat: rr->u.NM.flags); break; } - if ((strlen(retname) + rr->len - 5) >= 254) { + len = rr->len - 5; + if (retnamlen + len >= 254) { truncate = 1; break; } - strncat(retname, rr->u.NM.name, rr->len - 5); - retnamlen += rr->len - 5; + p = memchr(rr->u.NM.name, '\0', len); + if (unlikely(p)) + len = p - rr->u.NM.name; + memcpy(retname + retnamlen, rr->u.NM.name, len); + retnamlen += len; + retname[retnamlen] = '\0'; break; case SIG('R', 'E'): kfree(rs.buffer); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 36345fefa3ff..2d964ce45606 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -124,7 +124,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; int ret; - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); *cbh = NULL; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 81e622681c82..624a57a9c4aa 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1408,11 +1408,12 @@ out: /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. + * @write_op: With which operation should we write the journal sb * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ -static void jbd2_mark_journal_empty(journal_t *journal) +static void jbd2_mark_journal_empty(journal_t *journal, int write_op) { journal_superblock_t *sb = journal->j_superblock; @@ -1430,7 +1431,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, write_op); /* Log is no longer empty */ write_lock(&journal->j_state_lock); @@ -1716,7 +1717,13 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + + write_lock(&journal->j_state_lock); + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + write_unlock(&journal->j_state_lock); + + jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1975,7 +1982,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2021,7 +2028,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal); + jbd2_mark_journal_empty(journal, WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ca181e81c765..a2e724053919 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1156,6 +1156,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; @@ -1163,8 +1164,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "set next transaction"); spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; + spin_unlock(&journal->j_list_lock); } - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); /* @@ -1875,7 +1876,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) __blist_del_buffer(list, jh); jh->b_jlist = BJ_None; - if (test_clear_buffer_jbddirty(bh)) + if (transaction && is_journal_aborted(transaction->t_journal)) + clear_buffer_jbddirty(bh); + else if (test_clear_buffer_jbddirty(bh)) mark_buffer_dirty(bh); /* Expose it to the VM */ } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 2f7a3c090489..f9f86f87d32b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -235,9 +235,10 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - rc = posix_acl_equiv_mode(acl, &mode); - if (rc < 0) + umode_t mode; + + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) return rc; if (inode->i_mode != mode) { struct iattr attr; @@ -249,8 +250,6 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (rc < 0) return rc; } - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 0c8ca830b113..9fad9f4fe883 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -84,13 +84,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, case ACL_TYPE_ACCESS: ea_name = POSIX_ACL_XATTR_ACCESS; if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - if (rc < 0) + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) return rc; inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - if (rc == 0) - acl = NULL; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8f9176caf098..c8d58c5ac8ae 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -758,7 +758,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, sb->s_blocksize - offset : toread; tmp_bh.b_state = 0; - tmp_bh.b_size = 1 << inode->i_blkbits; + tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 0); if (err) return err; @@ -798,7 +798,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; - tmp_bh.b_size = 1 << inode->i_blkbits; + tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7247252ee9b1..6e9a912d394c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -833,21 +833,35 @@ repeat: mutex_lock(&kernfs_mutex); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { + struct kernfs_node *parent; struct inode *inode; - struct dentry *dentry; + /* + * We want fsnotify_modify() on @kn but as the + * modifications aren't originating from userland don't + * have the matching @file available. Look up the inodes + * and generate the events manually. + */ inode = ilookup(info->sb, kn->ino); if (!inode) continue; - dentry = d_find_any_alias(inode); - if (dentry) { - fsnotify_parent(NULL, dentry, FS_MODIFY); - fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, - NULL, 0); - dput(dentry); + parent = kernfs_get_parent(kn); + if (parent) { + struct inode *p_inode; + + p_inode = ilookup(info->sb, parent->ino); + if (p_inode) { + fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, + inode, FSNOTIFY_EVENT_INODE, kn->name, 0); + iput(p_inode); + } + + kernfs_put(parent); } + fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, + kn->name, 0); iput(inode); } diff --git a/fs/locks.c b/fs/locks.c index 6333263b7bc8..8eddae23e10b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1602,7 +1602,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; diff --git a/fs/mount.h b/fs/mount.h index 14db05d424f7..37c64bbe840c 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -13,6 +13,8 @@ struct mnt_namespace { u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; + unsigned int mounts; /* # of mounts in the namespace */ + unsigned int pending_mounts; }; struct mnt_pcp { @@ -55,6 +57,7 @@ struct mount { struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ + struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; @@ -86,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt) } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); -extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); diff --git a/fs/mpage.c b/fs/mpage.c index 1480d3a18037..6ade29b19494 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -111,7 +111,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) SetPageUptodate(page); return; } - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + create_empty_buffers(page, i_blocksize(inode), 0); } head = page_buffers(page); page_bh = head; diff --git a/fs/namei.c b/fs/namei.c index d8ee4da93650..3f96ae087488 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -887,6 +887,7 @@ static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; + kuid_t puid; if (!sysctl_protected_symlinks) return 0; @@ -902,7 +903,8 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + puid = parent->i_uid; + if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -2906,22 +2908,10 @@ no_open: dentry = lookup_real(dir, dentry, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; - - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - d_is_reg(dentry)) { - goto out; - } - /* will fail later, go on to get the right error */ - } + } + if (create_error && !dentry->d_inode) { + error = create_error; + goto out; } looked_up: path->dentry = dentry; @@ -4189,13 +4179,17 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, { int error; bool is_dir = d_is_dir(old_dentry); - const unsigned char *old_name; struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + struct name_snapshot old_name; - if (source == target) + /* + * Check source == target. + * On overlayfs need to look at underlying inodes. + */ + if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) return 0; error = may_delete(old_dir, old_dentry, is_dir); @@ -4243,7 +4237,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - old_name = fsnotify_oldname_init(old_dentry->d_name.name); + take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); @@ -4304,14 +4298,14 @@ out: mutex_unlock(&target->i_mutex); dput(new_dentry); if (!error) { - fsnotify_move(old_dir, new_dir, old_name, is_dir, + fsnotify_move(old_dir, new_dir, old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, new_is_dir, NULL, new_dentry); } } - fsnotify_oldname_free(old_name); + release_dentry_name_snapshot(&old_name); return error; } diff --git a/fs/namespace.c b/fs/namespace.c index 0570729c87fd..ec4078d16eb7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,9 @@ #include "pnode.h" #include "internal.h" +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; @@ -234,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_LIST_HEAD(&mnt->mnt_umounting); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@ -638,28 +642,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) } /* - * find the last mount at @dentry on vfsmount @mnt. - * mount_lock must be held. - */ -struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) -{ - struct mount *p, *res = NULL; - p = __lookup_mnt(mnt, dentry); - if (!p) - goto out; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - hlist_for_each_entry_continue(p, mnt_hash) { - if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) - break; - if (!(p->mnt.mnt_flags & MNT_UMOUNT)) - res = p; - } -out: - return res; -} - -/* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the @@ -743,26 +725,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) return NULL; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *get_mountpoint(struct dentry *dentry) { - struct hlist_head *chain = mp_hash(dentry); - struct mountpoint *mp; + struct mountpoint *mp, *new = NULL; int ret; - mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!mp) + if (d_mountpoint(dentry)) { +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; + } + + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) return ERR_PTR(-ENOMEM); + + /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); - if (ret) { - kfree(mp); - return ERR_PTR(ret); - } - mp->m_dentry = dentry; - mp->m_count = 1; - hlist_add_head(&mp->m_hash, chain); - INIT_HLIST_HEAD(&mp->m_list); + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? */ + mp = ERR_PTR(ret); + if (ret) + goto done; + + /* Add the new mountpoint to the hash table */ + read_seqlock_excl(&mount_lock); + new->m_dentry = dentry; + new->m_count = 1; + hlist_add_head(&new->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&new->m_list); + read_sequnlock_excl(&mount_lock); + + mp = new; + new = NULL; +done: + kfree(new); return mp; } @@ -855,6 +861,13 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +static void __attach_mnt(struct mount *mnt, struct mount *parent) +{ + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + /* * vfsmount lock must be held for write */ @@ -863,28 +876,45 @@ static void attach_mnt(struct mount *mnt, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); - hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); + __attach_mnt(mnt, parent); } -static void attach_shadowed(struct mount *mnt, - struct mount *parent, - struct mount *shadows) +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { - if (shadows) { - hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); - list_add(&mnt->mnt_child, &shadows->mnt_child); - } else { - hlist_add_head_rcu(&mnt->mnt_hash, - m_hash(&parent->mnt, mnt->mnt_mountpoint)); - list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); - } + struct mountpoint *old_mp = mnt->mnt_mp; + struct dentry *old_mountpoint = mnt->mnt_mountpoint; + struct mount *old_parent = mnt->mnt_parent; + + list_del_init(&mnt->mnt_child); + hlist_del_init(&mnt->mnt_mp_list); + hlist_del_init_rcu(&mnt->mnt_hash); + + attach_mnt(mnt, parent, mp); + + put_mountpoint(old_mp); + + /* + * Safely avoid even the suggestion this code might sleep or + * lock the mount hash by taking advantage of the knowledge that + * mnt_change_mountpoint will not release the final reference + * to a mountpoint. + * + * During mounting, the mount passed in as the parent mount will + * continue to use the old mountpoint and during unmounting, the + * old mountpoint will continue to exist until namespace_unlock, + * which happens well after mnt_change_mountpoint. + */ + spin_lock(&old_mountpoint->d_lock); + old_mountpoint->d_lockref.count--; + spin_unlock(&old_mountpoint->d_lock); + + mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ -static void commit_tree(struct mount *mnt, struct mount *shadows) +static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; @@ -899,7 +929,10 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); - attach_shadowed(mnt, parent, shadows); + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + + __attach_mnt(mnt, parent); touch_mnt_namespace(n); } @@ -1419,11 +1452,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); - __touch_mnt_namespace(p->mnt_ns); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; @@ -1557,11 +1595,12 @@ void __detach_mounts(struct dentry *dentry) struct mount *mnt; namespace_lock(); + lock_mount_hash(); mp = lookup_mountpoint(dentry); if (IS_ERR_OR_NULL(mp)) goto out_unlock; - lock_mount_hash(); + event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { @@ -1570,9 +1609,9 @@ void __detach_mounts(struct dentry *dentry) } else umount_tree(mnt, UMOUNT_CONNECTED); } - unlock_mount_hash(); put_mountpoint(mp); out_unlock: + unlock_mount_hash(); namespace_unlock(); } @@ -1693,7 +1732,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, continue; for (s = r; s; s = next_mnt(s, r)) { - struct mount *t = NULL; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { s = skip_mnt_tree(s); @@ -1715,14 +1753,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - mnt_set_mountpoint(parent, p->mnt_mp, q); - if (!list_empty(&parent->mnt_mounts)) { - t = list_last_entry(&parent->mnt_mounts, - struct mount, mnt_child); - if (t->mnt_mp != p->mnt_mp) - t = NULL; - } - attach_shadowed(q, parent, t); + attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } @@ -1831,6 +1862,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse) return 0; } +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1900,10 +1953,26 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct path *parent_path) { HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; + /* Preallocate a mountpoint in case the new mounts need + * to be tucked under other mounts. + */ + smp = get_mountpoint(source_mnt->mnt.mnt_root); + if (IS_ERR(smp)) + return PTR_ERR(smp); + + /* Is there space to add these mounts to the mount namespace? */ + if (!parent_path) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) @@ -1923,16 +1992,19 @@ static int attach_recursive_mnt(struct mount *source_mnt, touch_mnt_namespace(source_mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); - commit_tree(source_mnt, NULL); + commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt_last(&child->mnt_parent->mnt, - child->mnt_mountpoint); - commit_tree(child, q); + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); + commit_tree(child); } + put_mountpoint(smp); unlock_mount_hash(); return 0; @@ -1940,11 +2012,18 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: + ns->pending_mounts = 0; + + read_seqlock_excl(&mount_lock); + put_mountpoint(smp); + read_sequnlock_excl(&mount_lock); + return err; } @@ -1961,9 +2040,7 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = lookup_mountpoint(dentry); - if (!mp) - mp = new_mountpoint(dentry); + struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); @@ -1982,7 +2059,11 @@ retry: static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; + + read_seqlock_excl(&mount_lock); put_mountpoint(where); + read_sequnlock_excl(&mount_lock); + namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); } @@ -2401,8 +2482,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) + if (!fs_fully_visible(type, &mnt_flags)) { + put_filesystem(type); return -EPERM; + } } } @@ -2766,6 +2849,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->mounts = 0; + new_ns->pending_mounts = 0; return new_ns; } @@ -2815,6 +2900,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, q = new; while (p) { q->mnt_ns = new_ns; + new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); @@ -2853,6 +2939,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; + new_ns->mounts++; list_add(&mnt->mnt_list, &new_ns->list); } else { mntput(m); @@ -3052,9 +3139,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); + put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); - put_mountpoint(root_mp); error = 0; out4: unlock_mount(old_mp); @@ -3236,6 +3323,10 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); + /* Don't miss readonly hidden in the superblock flags */ + if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) + mnt_flags |= MNT_LOCK_READONLY; + /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ @@ -3262,7 +3353,7 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ - if (!(mnt_flags & MNT_LOCKED)) + if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? */ if (!is_empty_dir_inode(inode)) diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f31fd0dd92c6..b1daeafbea92 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,6 +121,7 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM + depends on 64BIT || LBDAF default NFS_V4 config PNFS_OBJLAYOUT diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a7f2e6e33305..48efe62e1302 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -261,7 +261,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, } ret = -EPROTONOSUPPORT; - if (minorversion == 0) + if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0) ret = nfs4_callback_up_net(serv, net); else if (xprt->ops->bc_up) ret = xprt->ops->bc_up(serv, net); @@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, err_socks: svc_rpcb_cleanup(serv, net); err_bind: + nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %p\n", ret, net); return ret; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 646cdac73488..e2e857affbf2 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -912,7 +912,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) - return rpc_drop_reply; + goto out_invalidcred; } cps.minorversion = hdr_arg.minorversion; @@ -940,6 +940,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; + +out_invalidcred: + pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); + return rpc_autherr_badcred; } /* diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5166adcfc0fb..7af5eeabc80e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static bool +nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t flags) +{ + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return true; + return false; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags && - !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (nfs4_is_valid_delegation(delegation, flags)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; @@ -892,7 +902,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - ret = (delegation != NULL && (delegation->type & flags) == flags); + ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce5a21861074..348e0a05bd18 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -377,7 +377,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); - error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages, + error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ @@ -462,7 +462,7 @@ void nfs_force_use_readdirplus(struct inode *dir) { if (!list_empty(&NFS_I(dir)->open_files)) { nfs_advise_use_readdirplus(dir); - nfs_zap_mapping(dir, dir->i_mapping); + invalidate_mapping_pages(dir->i_mapping, 0, -1); } } @@ -560,7 +560,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; if (desc->plus != 0) - nfs_prime_dcache(desc->file->f_path.dentry, entry); + nfs_prime_dcache(file_dentry(desc->file), entry); status = nfs_readdir_add_to_array(entry, page); if (status != 0) @@ -847,24 +847,13 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } -static bool nfs_dir_mapping_need_revalidate(struct inode *dir) -{ - struct nfs_inode *nfsi = NFS_I(dir); - - if (nfs_attribute_cache_expired(dir)) - return true; - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - return true; - return false; -} - /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. */ static int nfs_readdir(struct file *file, struct dir_context *ctx) { - struct dentry *dentry = file->f_path.dentry; + struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); nfs_readdir_descriptor_t my_desc, *desc = &my_desc; @@ -890,7 +879,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0; nfs_block_sillyrename(dentry); - if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1146,11 +1135,13 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) /* Force a full look up iff the parent directory has changed */ if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { - - if (nfs_lookup_verify_inode(inode, flags)) { + error = nfs_lookup_verify_inode(inode, flags); + if (error) { if (flags & LOOKUP_RCU) return -ECHILD; - goto out_zap_parent; + if (error == -ESTALE) + goto out_zap_parent; + goto out_error; } goto out_valid; } @@ -1174,8 +1165,10 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) trace_nfs_lookup_revalidate_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - if (error) + if (error == -ESTALE || error == -ENOENT) goto out_bad; + if (error) + goto out_error; if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out_bad; if ((error = nfs_refresh_inode(inode, fattr)) != 0) @@ -1531,9 +1524,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); + d_drop(dentry); switch (err) { case -ENOENT: - d_drop(dentry); d_add(dentry, NULL); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; @@ -2432,6 +2425,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) } EXPORT_SYMBOL_GPL(nfs_may_open); +static int nfs_execute_ok(struct inode *inode, int mask) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (mask & MAY_NOT_BLOCK) + ret = nfs_revalidate_inode_rcu(server, inode); + else + ret = nfs_revalidate_inode(server, inode); + if (ret == 0 && !execute_ok(inode)) + ret = -EACCES; + return ret; +} + int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; @@ -2449,6 +2456,9 @@ int nfs_permission(struct inode *inode, int mask) case S_IFLNK: goto out; case S_IFREG: + if ((mask & MAY_OPEN) && + nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)) + return 0; break; case S_IFDIR: /* @@ -2481,8 +2491,8 @@ force_lookup: res = PTR_ERR(cred); } out: - if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) - res = -EACCES; + if (!res && (mask & MAY_EXEC)) + res = nfs_execute_ok(inode, mask); dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 93e236429c5d..dc875cd0e11d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -407,7 +407,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, */ if (!PageUptodate(page)) { unsigned pglen = nfs_page_length(page); - unsigned end = offset + len; + unsigned end = offset + copied; if (pglen == 0) { zero_user_segments(page, 0, offset, diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 02ec07973bc4..fd8da630fd22 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -374,8 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4946ef40ba87..85ef38f9765f 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -283,7 +283,8 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) s->nfs_client->cl_rpcclient->cl_auth->au_flavor); out_test_devid: - if (filelayout_test_devid_unavailable(devid)) + if (ret->ds_clp == NULL || + filelayout_test_devid_unavailable(devid)) ret = NULL; out: return ret; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 2a2e2d8ddee5..54313322ee5b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1414,8 +1414,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE - && ff_layout_need_layoutcommit(data->lseg)) + if (ff_layout_need_layoutcommit(data->lseg)) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e125e55de86d..2603d7589946 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,6 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds->ds_versions); kfree_rcu(mirror_ds, id_node.rcu); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3e2071a177fd..668ac19af58f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -927,7 +927,7 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1241,9 +1241,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return 0; /* Has the inode gone and changed behind our back? */ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) - return -EIO; + return -ESTALE; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) - return -EIO; + return -ESTALE; if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9dea85f7f918..578350fd96e1 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -243,7 +243,6 @@ int nfs_iocounter_wait(struct nfs_io_counter *c); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6b1ce9825430..7f1a0fb8c493 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -269,6 +269,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); + rpc_put_task(task); return 0; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index db9b5fea5b3e..679e003818b1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -26,7 +26,7 @@ static int nfs4_file_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - struct dentry *dentry = filp->f_path.dentry; + struct dentry *dentry = file_dentry(filp); struct dentry *parent = NULL; struct inode *dir; unsigned openflags = filp->f_flags; @@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 98a44157353a..8e425f2c5ddd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2188,8 +2188,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) return 0; - /* even though OPEN succeeded, access is denied. Close the file */ - nfs4_close_state(state, fmode); return -EACCES; } @@ -2422,7 +2420,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, sattr->ia_valid |= ATTR_MTIME; /* Except MODE, it seems harmless of setting twice. */ - if ((attrset[1] & FATTR4_WORD1_MODE)) + if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && + attrset[1] & FATTR4_WORD1_MODE) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) @@ -2451,6 +2450,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = PTR_ERR(state); if (IS_ERR(state)) goto out; + ctx->state = state; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); @@ -2473,7 +2473,6 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, if (ret != 0) goto out; - ctx->state = state; if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) @@ -2854,12 +2853,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) call_close |= is_wronly; else if (is_wronly) calldata->arg.fmode |= FMODE_WRITE; + if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE)) + call_close |= is_rdwr; } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (calldata->arg.fmode == 0) - call_close |= is_rdwr; - if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -4711,7 +4709,7 @@ out: */ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { - struct page *pages[NFS4ACL_MAXPAGES] = {NULL, }; + struct page *pages[NFS4ACL_MAXPAGES + 1] = {NULL, }; struct nfs_getaclargs args = { .fh = NFS_FH(inode), .acl_pages = pages, @@ -4725,13 +4723,9 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu .rpc_argp = &args, .rpc_resp = &res, }; - unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); + unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; int ret = -ENOMEM, i; - /* As long as we're doing a round trip to the server anyway, - * let's be prepared for a page of acl data. */ - if (npages == 0) - npages = 1; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -7425,12 +7419,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_create_session(clp, status); + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_DELAY: + case -ETIMEDOUT: + case -EACCES: + case -EAGAIN: + goto out; + }; + + clp->cl_seqid++; if (!status) { /* Verify the session's negotiated channel_attrs values */ status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - if (clp->cl_seqid == res.seqid) - clp->cl_seqid++; if (status) goto out; nfs4_update_session(session, &res); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d854693a15b0..e8d1d6c5000c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1072,6 +1072,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) case -NFS4ERR_BADXDR: case -NFS4ERR_RESOURCE: case -NFS4ERR_NOFILEHANDLE: + case -NFS4ERR_MOVED: /* Non-seqid mutating errors */ return; }; @@ -1493,6 +1494,9 @@ restart: __func__, status); case -ENOENT: case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: case -ESTALE: /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e4441216804..1cb50bb898b0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2487,7 +2487,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 1; + replen = hdr.replen + op_decode_hdr_maxsz; encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); xdr_inline_pages(&req->rq_rcv_buf, replen << 2, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 452a011ba0d8..8ebfdd00044b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -528,16 +528,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) } EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); -/* - * nfs_pgio_header_free - Free a read or write header - * @hdr: The header to free - */ -void nfs_pgio_header_free(struct nfs_pgio_header *hdr) -{ - hdr->rw_ops->rw_free_header(hdr); -} -EXPORT_SYMBOL_GPL(nfs_pgio_header_free); - /** * nfs_pgio_data_destroy - make @hdr suitable for reuse * @@ -546,14 +536,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { if (hdr->args.context) put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + nfs_pgio_data_destroy(hdr); + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call @@ -671,7 +671,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, u32 midx; set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); /* TODO: Make sure it's right to clean up all mirrors here * and not just hdr->pgio_mirror_idx */ @@ -689,7 +688,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bec0384499f7..7af7bedd7c02 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -365,6 +365,9 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) { + /* Serialise LAYOUTGET/LAYOUTRETURN */ + if (atomic_read(&lo->plh_outstanding) != 0) + return false; if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; lo->plh_return_iomode = 0; @@ -1182,13 +1185,11 @@ bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) * i_lock */ spin_lock(&ino->i_lock); lo = nfsi->layout; - if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); sleep = true; + } spin_unlock(&ino->i_lock); - - if (sleep) - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - return sleep; } @@ -1530,6 +1531,7 @@ pnfs_update_layout(struct inode *ino, goto out; lookup_again: + nfs4_client_recover_expired_lease(clp); first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); @@ -1941,7 +1943,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } @@ -2057,7 +2058,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7b9316406930..7a9b6e347249 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1261,6 +1261,9 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", file, count, (long long)(page_file_offset(page) + offset)); + if (!count) + goto out; + if (nfs_can_extend_write(file, page, inode)) { count = max(count + offset, nfs_page_length(page)); offset = 0; @@ -1271,7 +1274,7 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_set_pageerror(page); else __set_page_dirty_nobuffers(page); - +out: dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c29d9421bd5e..0976f8dad4ce 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -50,7 +50,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, { struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; - u32 block_size = (1 << inode->i_blkbits); + u32 block_size = i_blocksize(inode); struct pnfs_block_extent *bex; struct iomap iomap; u32 device_generation = 0; @@ -151,7 +151,7 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, int error; nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + lcp->lc_up_len, &iomaps, i_blocksize(inode)); if (nr_iomaps < 0) return nfserrno(nr_iomaps); diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1580ea6fd64d..d08cd88155c7 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -104,22 +104,21 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, goto out; inode = d_inode(fh->fh_dentry); - if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { - error = -EOPNOTSUPP; - goto out_errno; - } error = fh_want_write(fh); if (error) goto out_errno; - error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); if (error) - goto out_drop_write; - error = inode->i_op->set_acl(inode, argp->acl_default, - ACL_TYPE_DEFAULT); + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); if (error) - goto out_drop_write; + goto out_drop_lock; + + fh_unlock(fh); fh_drop_write(fh); @@ -131,7 +130,8 @@ out: posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; -out_drop_write: +out_drop_lock: + fh_unlock(fh); fh_drop_write(fh); out_errno: nfserr = nfserrno(error); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 01df4cd7c753..0c890347cde3 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -95,22 +95,20 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, goto out; inode = d_inode(fh->fh_dentry); - if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { - error = -EOPNOTSUPP; - goto out_errno; - } error = fh_want_write(fh); if (error) goto out_errno; - error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); if (error) - goto out_drop_write; - error = inode->i_op->set_acl(inode, argp->acl_default, - ACL_TYPE_DEFAULT); + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); -out_drop_write: +out_drop_lock: + fh_unlock(fh); fh_drop_write(fh); out_errno: nfserr = nfserrno(error); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 00575d776d91..7162ab7bc093 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -358,6 +358,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); + struct kvec *head = rqstp->rq_arg.head; p = decode_fh(p, &args->fh); if (!p) @@ -367,6 +368,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, args->count = ntohl(*p++); args->stable = ntohl(*p++); len = args->len = ntohl(*p++); + if ((void *)p > head->iov_base + head->iov_len) + return 0; /* * The count must equal the amount of data passed. */ @@ -377,9 +380,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + hdr = (void*)p - head->iov_base; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that @@ -396,7 +398,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, len = args->len = max_blocksize; } rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; @@ -471,6 +473,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* first copy and check from the first page */ old = (char*)p; vec = &rqstp->rq_arg.head[0]; + if ((void *)old > vec->iov_base + vec->iov_len) + return 0; avail = vec->iov_len - (old - (char*)vec->iov_base); while (len && avail && *old) { *new++ = *old++; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6adabd6049b7..71292a0d6f09 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -770,9 +770,6 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; inode = d_inode(dentry); - if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) - return nfserr_attrnotsupp; - if (S_ISDIR(inode->i_mode)) flags = NFS4_ACL_DIR; @@ -782,16 +779,19 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_error < 0) goto out_nfserr; - host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + fh_lock(fhp); + + host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); if (host_error < 0) - goto out_release; + goto out_drop_lock; if (S_ISDIR(inode->i_mode)) { - host_error = inode->i_op->set_acl(inode, dpacl, - ACL_TYPE_DEFAULT); + host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); } -out_release: +out_drop_lock: + fh_unlock(fhp); + posix_acl_release(pacl); posix_acl_release(dpacl); out_nfserr: diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index e7f50c4081d6..15bdc2d48cfe 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -710,22 +710,6 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } -static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) -{ - struct rpc_xprt *xprt; - - if (args->protocol != XPRT_TRANSPORT_BC_TCP) - return rpc_create(args); - - xprt = args->bc_xprt->xpt_bc_xprt; - if (xprt) { - xprt_get(xprt); - return rpc_create_xprt(args, xprt); - } - - return rpc_create(args); -} - static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { int maxtime = max_cb_time(clp->net); @@ -768,7 +752,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = create_backchannel_client(&args); + client = rpc_create(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index c9d6c715c0fb..9eed219f57a5 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -189,10 +189,11 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, struct nfs4_layout_stateid *ls; struct nfs4_stid *stp; - stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache, + nfsd4_free_layout_stateid); if (!stp) return NULL; - stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); stp->sc_file = fp; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a9f096c7e99f..209dbfc50cd4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -877,6 +877,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return err; + fh_unlock(&cstate->current_fh); if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; @@ -1689,6 +1690,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, opdesc->op_get_currentstateid(cstate, &op->u); op->status = opdesc->op_func(rqstp, cstate, &op->u); + /* Only from SEQUENCE */ + if (cstate->status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + status = op->status; + goto out; + } if (!op->status) { if (opdesc->op_set_currentstateid) opdesc->op_set_currentstateid(cstate, &op->u); @@ -1699,14 +1706,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (need_wrongsec_check(rqstp)) op->status = check_nfsd_access(current_fh->fh_export, rqstp); } - encode_op: - /* Only from SEQUENCE */ - if (cstate->status == nfserr_replay_cache) { - dprintk("%s NFS4.1 replay from cache\n", __func__); - status = op->status; - goto out; - } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(&resp->xdr, op); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6b800b5b8fed..c7f1ce41442a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -553,8 +553,8 @@ out: return co; } -struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, - struct kmem_cache *slab) +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)) { struct nfs4_stid *stid; int new_id; @@ -570,6 +570,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, idr_preload_end(); if (new_id < 0) goto out_free; + + stid->sc_free = sc_free; stid->sc_client = cl; stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; @@ -595,15 +597,12 @@ out_free: static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; - struct nfs4_ol_stateid *stp; - stid = nfs4_alloc_stid(clp, stateid_slab); + stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid); if (!stid) return NULL; - stp = openlockstateid(stid); - stp->st_stid.sc_free = nfs4_free_ol_stateid; - return stp; + return openlockstateid(stid); } static void nfs4_free_deleg(struct nfs4_stid *stid) @@ -701,11 +700,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, goto out_dec; if (delegation_blocked(¤t_fh->fh_handle)) goto out_dec; - dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); if (dp == NULL) goto out_dec; - dp->dl_stid.sc_free = nfs4_free_deleg; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -1200,27 +1198,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist) } } -static void release_lockowner(struct nfs4_lockowner *lo) -{ - struct nfs4_client *clp = lo->lo_owner.so_client; - struct nfs4_ol_stateid *stp; - struct list_head reaplist; - - INIT_LIST_HEAD(&reaplist); - - spin_lock(&clp->cl_lock); - unhash_lockowner_locked(lo); - while (!list_empty(&lo->lo_owner.so_stateids)) { - stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); - put_ol_stateid_locked(stp, &reaplist); - } - spin_unlock(&clp->cl_lock); - free_ol_stateid_reaplist(&reaplist); - nfs4_put_stateowner(&lo->lo_owner); -} - static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, struct list_head *reaplist) { @@ -3452,6 +3429,10 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfs4_openowner *oo = open->op_openowner; struct nfs4_ol_stateid *retstp = NULL; + /* We are moving these outside of the spinlocks to avoid the warnings */ + mutex_init(&stp->st_mutex); + mutex_lock(&stp->st_mutex); + spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3467,13 +3448,17 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; - init_rwsem(&stp->st_rwsem); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); + if (retstp) { + mutex_lock(&retstp->st_mutex); + /* Not that we need to, just for neatness */ + mutex_unlock(&stp->st_mutex); + } return retstp; } @@ -4300,32 +4285,34 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ - down_read(&stp->st_rwsem); + mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto out; } } else { stp = open->op_stp; open->op_stp = NULL; + /* + * init_open_stateid() either returns a locked stateid + * it found, or initializes and locks the new one we passed in + */ swapstp = init_open_stateid(stp, fp, open); if (swapstp) { nfs4_put_stid(&stp->st_stid); stp = swapstp; - down_read(&stp->st_rwsem); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto out; } goto upgrade_out; } - down_read(&stp->st_rwsem); status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); release_open_stateid(stp); goto out; } @@ -4337,7 +4324,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } upgrade_out: nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { @@ -4872,6 +4859,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfs_ok; } +static __be32 +nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) +{ + struct nfs4_ol_stateid *stp = openlockstateid(s); + __be32 ret; + + mutex_lock(&stp->st_mutex); + + ret = check_stateid_generation(stateid, &s->sc_stateid, 1); + if (ret) + goto out; + + ret = nfserr_locks_held; + if (check_for_locks(stp->st_stid.sc_file, + lockowner(stp->st_stateowner))) + goto out; + + release_lock_stateid(stp); + ret = nfs_ok; + +out: + mutex_unlock(&stp->st_mutex); + nfs4_put_stid(s); + return ret; +} + __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_free_stateid *free_stateid) @@ -4879,7 +4892,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid = &free_stateid->fr_stateid; struct nfs4_stid *s; struct nfs4_delegation *dp; - struct nfs4_ol_stateid *stp; struct nfs4_client *cl = cstate->session->se_client; __be32 ret = nfserr_bad_stateid; @@ -4898,18 +4910,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - ret = check_stateid_generation(stateid, &s->sc_stateid, 1); - if (ret) - break; - stp = openlockstateid(s); - ret = nfserr_locks_held; - if (check_for_locks(stp->st_stid.sc_file, - lockowner(stp->st_stateowner))) - break; - WARN_ON(!unhash_lock_stateid(stp)); + atomic_inc(&s->sc_count); spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; + ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: dp = delegstateid(s); @@ -4950,12 +4953,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * revoked delegations are kept only for free_stateid. */ return nfserr_bad_stateid; - down_write(&stp->st_rwsem); + mutex_lock(&stp->st_mutex); status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); if (status != nfs_ok) - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); return status; } @@ -5003,7 +5006,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } @@ -5035,12 +5038,12 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) { - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto put_stateid; } oo->oo_flags |= NFS4_OO_CONFIRMED; nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); @@ -5116,7 +5119,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); @@ -5169,7 +5172,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); @@ -5391,11 +5394,10 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; - stp->st_stid.sc_free = nfs4_free_lock_stateid; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - init_rwsem(&stp->st_rwsem); + mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); spin_lock(&fp->fi_lock); @@ -5434,7 +5436,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, lst = find_lock_stateid(lo, fi); if (lst == NULL) { spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab); + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); if (ns == NULL) return NULL; @@ -5476,7 +5478,7 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, - struct nfs4_ol_stateid **lst, bool *new) + struct nfs4_ol_stateid **plst, bool *new) { __be32 status; struct nfs4_file *fi = ost->st_stid.sc_file; @@ -5484,7 +5486,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_client *cl = oo->oo_owner.so_client; struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; + struct nfs4_ol_stateid *lst; unsigned int strhashval; + bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5500,12 +5504,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } - *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); - if (*lst == NULL) { +retry: + lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); + if (lst == NULL) { status = nfserr_jukebox; goto out; } + + mutex_lock(&lst->st_mutex); + + /* See if it's still hashed to avoid race with FREE_STATEID */ + spin_lock(&cl->cl_lock); + hashed = !list_empty(&lst->st_perfile); + spin_unlock(&cl->cl_lock); + + if (!hashed) { + mutex_unlock(&lst->st_mutex); + nfs4_put_stid(&lst->st_stid); + goto retry; + } status = nfs_ok; + *plst = lst; out: nfs4_put_stateowner(&lo->lo_owner); return status; @@ -5564,7 +5583,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &open_stp, nn); if (status) goto out; - up_write(&open_stp->st_rwsem); + mutex_unlock(&open_stp->st_mutex); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, @@ -5572,8 +5591,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); - if (status == nfs_ok) - down_write(&lock_stp->st_rwsem); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, @@ -5677,7 +5694,7 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - up_write(&lock_stp->st_rwsem); + mutex_unlock(&lock_stp->st_mutex); /* * If this is a new, never-before-used stateid, and we are @@ -5847,7 +5864,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput: fput(filp); put_stateid: - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); @@ -5911,6 +5928,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp; + LIST_HEAD (reaplist); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -5941,9 +5959,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_get_stateowner(sop); break; } + if (!lo) { + spin_unlock(&clp->cl_lock); + return status; + } + + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, + st_perstateowner); + WARN_ON(!unhash_lock_stateid(stp)); + put_ol_stateid_locked(stp, &reaplist); + } spin_unlock(&clp->cl_lock); - if (lo) - release_lockowner(lo); + free_ol_stateid_reaplist(&reaplist); + nfs4_put_stateowner(&lo->lo_owner); + return status; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51c9e9ca39a4..544672b440de 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -129,7 +129,7 @@ static void next_decode_page(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); + argp->end = argp->p + XDR_QUADLEN(argp->pagelen); argp->pagelen = 0; } else { argp->end = argp->p + (PAGE_SIZE>>2); @@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename READ_BUF(4); rename->rn_snamelen = be32_to_cpup(p++); - READ_BUF(rename->rn_snamelen + 4); + READ_BUF(rename->rn_snamelen); SAVEMEM(rename->rn_sname, rename->rn_snamelen); + READ_BUF(4); rename->rn_tnamelen = be32_to_cpup(p++); READ_BUF(rename->rn_tnamelen); SAVEMEM(rename->rn_tname, rename->rn_tnamelen); @@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient READ_BUF(8); setclientid->se_callback_prog = be32_to_cpup(p++); setclientid->se_callback_netid_len = be32_to_cpup(p++); - - READ_BUF(setclientid->se_callback_netid_len + 4); + READ_BUF(setclientid->se_callback_netid_len); SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len); + READ_BUF(4); setclientid->se_callback_addr_len = be32_to_cpup(p++); - READ_BUF(setclientid->se_callback_addr_len + 4); + READ_BUF(setclientid->se_callback_addr_len); SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len); + READ_BUF(4); setclientid->se_callback_ident = be32_to_cpup(p++); DECODE_TAIL; @@ -1244,9 +1246,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->pagelen -= pages * PAGE_SIZE; len -= pages * PAGE_SIZE; - argp->p = (__be32 *)page_address(argp->pagelist[0]); - argp->pagelist++; - argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); + next_decode_page(argp); } argp->p += XDR_QUADLEN(len); @@ -1815,8 +1815,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) READ_BUF(4); argp->taglen = be32_to_cpup(p++); - READ_BUF(argp->taglen + 8); + READ_BUF(argp->taglen); SAVEMEM(argp->tag, argp->taglen); + READ_BUF(8); argp->minorversion = be32_to_cpup(p++); argp->opcnt = be32_to_cpup(p++); max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2); @@ -2750,9 +2751,16 @@ out_acl: } #endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, - NFSD_SUPPATTR_EXCLCREAT_WORD1, - NFSD_SUPPATTR_EXCLCREAT_WORD2); + u32 supp[3]; + + supp[0] = nfsd_suppattrs0(minorversion); + supp[1] = nfsd_suppattrs1(minorversion); + supp[2] = nfsd_suppattrs2(minorversion); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); if (status) goto out; } @@ -4038,8 +4046,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[gdev->gd_layout_type]; + const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4056,6 +4063,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, /* If maxcount is 0 then just update notifications */ if (gdev->gd_maxcount != 0) { + ops = nfsd4_layout_ops[gdev->gd_layout_type]; nfserr = ops->encode_getdeviceinfo(xdr, gdev); if (nfserr) { /* @@ -4108,8 +4116,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[lgp->lg_layout_type]; + const struct nfsd4_layout_ops *ops; __be32 *p; dprintk("%s: err %d\n", __func__, nfserr); @@ -4132,6 +4139,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(lgp->lg_seg.iomode); *p++ = cpu_to_be32(lgp->lg_layout_type); + ops = nfsd4_layout_ops[lgp->lg_layout_type]; nfserr = ops->encode_layoutget(xdr, lgp); out: kfree(lgp->lg_content); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ad4e2377dd63..5be1fa6b676d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -656,6 +656,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr) return nfserr; } +/* + * A write procedure can have a large argument, and a read procedure can + * have a large reply, but no NFSv2 or NFSv3 procedure has argument and + * reply that can both be larger than a page. The xdr code has taken + * advantage of this assumption to be a sloppy about bounds checking in + * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that + * problem, we enforce these assumptions here: + */ +static bool nfs_request_too_big(struct svc_rqst *rqstp, + struct svc_procedure *proc) +{ + /* + * The ACL code has more careful bounds-checking and is not + * susceptible to this problem: + */ + if (rqstp->rq_prog != NFS_PROGRAM) + return false; + /* + * Ditto NFSv4 (which can in theory have argument and reply both + * more than a page): + */ + if (rqstp->rq_vers >= 4) + return false; + /* The reply will be small, we're OK: */ + if (proc->pc_xdrressize > 0 && + proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) + return false; + + return rqstp->rq_arg.len > PAGE_SIZE; +} + int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -668,6 +699,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + if (nfs_request_too_big(rqstp, proc)) { + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + } /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79d964aa8079..bf913201a6ad 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_writeargs *args) { unsigned int len, hdr, dlen; + struct kvec *head = rqstp->rq_arg.head; int v; p = decode_fh(p, &args->fh); @@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + hdr = (void*)p - head->iov_base; + if (hdr > head->iov_len) + return 0; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* * Round the length of the data which was specified up to @@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, return 0; rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 77fdf4de91ba..5134eedcb16c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -535,7 +535,7 @@ struct nfs4_ol_stateid { unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; - struct rw_semaphore st_rwsem; + struct mutex st_mutex; }; static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) @@ -583,8 +583,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); -struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, - struct kmem_cache *slab); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 994d66fbb446..91e0c5429b4d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -369,7 +369,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, __be32 err; int host_err; bool get_write_count; - int size_change = 0; + bool size_change = (iap->ia_valid & ATTR_SIZE); if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; @@ -382,11 +382,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, /* Get inode */ err = fh_verify(rqstp, fhp, ftype, accmode); if (err) - goto out; + return err; if (get_write_count) { host_err = fh_want_write(fhp); if (host_err) - return nfserrno(host_err); + goto out; } dentry = fhp->fh_dentry; @@ -397,20 +397,28 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~ATTR_MODE; if (!iap->ia_valid) - goto out; + return 0; nfsd_sanitize_attrs(inode, iap); + if (check_guard && guardtime != inode->i_ctime.tv_sec) + return nfserr_notsync; + /* * The size case is special, it changes the file in addition to the - * attributes. + * attributes, and file systems don't expect it to be mixed with + * "random" attribute changes. We thus split out the size change + * into a separate call to ->setattr, and do the rest as a separate + * setattr call. */ - if (iap->ia_valid & ATTR_SIZE) { + if (size_change) { err = nfsd_get_write_access(rqstp, fhp, iap); if (err) - goto out; - size_change = 1; + return err; + } + fh_lock(fhp); + if (size_change) { /* * RFC5661, Section 18.30.4: * Changing the size of a file with SETATTR indirectly @@ -418,29 +426,36 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * * (and similar for the older RFCs) */ - if (iap->ia_size != i_size_read(inode)) - iap->ia_valid |= ATTR_MTIME; - } + struct iattr size_attr = { + .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME, + .ia_size = iap->ia_size, + }; - iap->ia_valid |= ATTR_CTIME; + host_err = notify_change(dentry, &size_attr, NULL); + if (host_err) + goto out_unlock; + iap->ia_valid &= ~ATTR_SIZE; - if (check_guard && guardtime != inode->i_ctime.tv_sec) { - err = nfserr_notsync; - goto out_put_write_access; + /* + * Avoid the additional setattr call below if the only other + * attribute that the client sends is the mtime, as we update + * it as part of the size change above. + */ + if ((iap->ia_valid & ~ATTR_MTIME) == 0) + goto out_unlock; } - fh_lock(fhp); + iap->ia_valid |= ATTR_CTIME; host_err = notify_change(dentry, iap, NULL); - fh_unlock(fhp); - err = nfserrno(host_err); -out_put_write_access: +out_unlock: + fh_unlock(fhp); if (size_change) put_write_access(inode); - if (!err) - err = nfserrno(commit_metadata(fhp)); out: - return err; + if (!host_err) + host_err = commit_metadata(fhp); + return nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index a35ae35e6932..cd39b57288c2 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -55,7 +55,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) brelse(bh); BUG(); } - memset(bh->b_data, 0, 1 << inode->i_blkbits); + memset(bh->b_data, 0, i_blocksize(inode)); bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index ac2f64943ff4..00877ef0b120 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -55,7 +55,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; - inode_add_bytes(inode, (1 << inode->i_blkbits) * n); + inode_add_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_add(n, &root->blocks_count); } @@ -64,7 +64,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; - inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); + inode_sub_bytes(inode, i_blocksize(inode) * n); if (root) atomic64_sub(n, &root->blocks_count); } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 1125f40233ff..612a2457243d 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -60,7 +60,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, set_buffer_mapped(bh); kaddr = kmap_atomic(bh->b_page); - memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); @@ -503,7 +503,7 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_entry_size = entry_size; - mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_entries_per_block = i_blocksize(inode) / entry_size; mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3b65adaae7e4..2f27c935bd57 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -719,7 +719,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, lock_page(page); if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); + create_empty_buffers(page, i_blocksize(inode), 0); unlock_page(page); bh = head = page_buffers(page); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801afb53..37e49cb2ac4c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -443,7 +443,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); - if (bytes > BLOCK_SIZE) + if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d2f97ecca6a5..e0e5f7c3c99f 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response || - atomic_read(&group->fanotify_data.bypass_perm)); - - if (!event->response) { /* bypass_perm set */ - /* - * Event was canceled because group is being destroyed. Remove - * it from group's event list because we are responsible for - * freeing the permission event. - */ - fsnotify_remove_event(group, &event->fae.fse); - return 0; - } + wait_event(group->fanotify_data.access_waitq, event->response); /* userspace responded, convert to something usable */ switch (event->response) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8e8e6bcd1d43..a64313868d3a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS struct fanotify_perm_event_info *event, *next; + struct fsnotify_event *fsn_event; /* - * There may be still new events arriving in the notification queue - * but since userspace cannot use fanotify fd anymore, no event can - * enter or leave access_list by now. + * Stop new events from arriving in the notification queue. since + * userspace cannot use fanotify fd anymore, no event can enter or + * leave access_list by now either. */ - spin_lock(&group->fanotify_data.access_lock); - - atomic_inc(&group->fanotify_data.bypass_perm); + fsnotify_group_stop_queueing(group); + /* + * Process all permission events on access_list and notification queue + * and simulate reply from userspace. + */ + spin_lock(&group->fanotify_data.access_lock); list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, fae.fse.list) { pr_debug("%s: found group=%p event=%p\n", __func__, group, @@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file) spin_unlock(&group->fanotify_data.access_lock); /* - * Since bypass_perm is set, newly queued events will not wait for - * access response. Wake up the already sleeping ones now. - * synchronize_srcu() in fsnotify_destroy_group() will wait for all - * processes sleeping in fanotify_handle_event() waiting for access - * response and thus also for all permission events to be freed. + * Destroy all non-permission events. For permission events just + * dequeue them and set the response. They will be freed once the + * response is consumed and fanotify_get_response() returns. */ + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + fsn_event = fsnotify_remove_first_event(group); + if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, fsn_event); + else + FANOTIFY_PE(fsn_event)->response = FAN_ALLOW; + } + mutex_unlock(&group->notification_mutex); + + /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); #endif @@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) spin_lock_init(&group->fanotify_data.access_lock); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index db39de2dd4cb..a64adc2fced9 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -104,16 +104,20 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) if (unlikely(!fsnotify_inode_watches_children(p_inode))) __fsnotify_update_child_dentry_flags(p_inode); else if (p_inode->i_fsnotify_mask & mask) { + struct name_snapshot name; + /* we are notifying a parent so come up with the new mask which * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; + take_dentry_name_snapshot(&name, dentry); if (path) ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - dentry->d_name.name, 0); + name.name, 0); else ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); + name.name, 0); + release_dentry_name_snapshot(&name); } dput(parent); diff --git a/fs/notify/group.c b/fs/notify/group.c index d16b62cb2854..18eb30c6bd8f 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) } /* + * Stop queueing new events for this group. Once this function returns + * fsnotify_add_event() will not add any new events to the group's queue. + */ +void fsnotify_group_stop_queueing(struct fsnotify_group *group) +{ + mutex_lock(&group->notification_mutex); + group->shutdown = true; + mutex_unlock(&group->notification_mutex); +} + +/* * Trying to get rid of a group. Remove all marks, flush all events and release * the group reference. * Note that another thread calling fsnotify_clear_marks_by_group() may still @@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { + /* + * Stop queueing new events. The code below is careful enough to not + * require this but fanotify needs to stop queuing events even before + * fsnotify_destroy_group() is called and this makes the other callers + * of fsnotify_destroy_group() to see the same behavior. + */ + fsnotify_group_stop_queueing(group); + /* clear all inode marks for this group */ fsnotify_clear_marks_by_group(group); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index a95d8e037aeb..e455e83ceeeb 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * Add an event to the group notification queue. The group can later pull this * event off the queue to deal with. The function returns 0 if the event was * added to the queue, 1 if the event was merged with some other queued event, - * 2 if the queue of events has overflown. + * 2 if the event was not queued - either the queue of events has overflown + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, @@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group, mutex_lock(&group->notification_mutex); + if (group->shutdown) { + mutex_unlock(&group->notification_mutex); + return 2; + } + if (group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ @@ -126,21 +132,6 @@ queue: } /* - * Remove @event from group's notification queue. It is the responsibility of - * the caller to destroy the event. - */ -void fsnotify_remove_event(struct fsnotify_group *group, - struct fsnotify_event *event) -{ - mutex_lock(&group->notification_mutex); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - group->q_len--; - } - mutex_unlock(&group->notification_mutex); -} - -/* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..164307b99405 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -241,13 +241,11 @@ int ocfs2_set_acl(handle_t *handle, case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - umode_t mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; + umode_t mode; - if (ret == 0) - acl = NULL; + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + return ret; ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); @@ -322,3 +320,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) brelse(di_bh); return acl; } + +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); + posix_acl_release(acl); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + umode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + mode = inode->i_mode; + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); + } + } +cleanup: + posix_acl_release(acl); + return ret; +} diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 3fce68d08625..2783a75b3999 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e6795c7c76a8..e4184bd2a954 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1103,7 +1103,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; - unsigned int bsize = 1 << inode->i_blkbits; + unsigned int bsize = i_blocksize(inode); if (!page_has_buffers(page)) create_empty_buffers(page, bsize, 0); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 709fbbd44c65..acebc350e98d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2070,13 +2070,13 @@ unlock: spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2125,11 +2125,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index e36d63ff1783..2e11658676eb 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -287,6 +287,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, status = DLM_DENIED; goto bail; } + + if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) { + mlog(0, "last convert request returned DLM_RECOVERING, but " + "owner has already queued and sent ast to me. res %.*s, " + "(cookie=%u:%llu, type=%d, conv=%d)\n", + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), + lock->ml.type, lock->ml.convert_type); + status = DLM_NORMAL; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ @@ -315,13 +328,22 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; - lock->convert_pending = 0; - /* if it failed, move it back to granted queue */ + /* if it failed, move it back to granted queue. + * if master returns DLM_NORMAL and then down before sending ast, + * it may have already been moved to granted queue, reset to + * DLM_RECOVERING and retry convert */ if (status != DLM_NORMAL) { if (status != DLM_NOTQUEUED) dlm_error(status); dlm_revert_pending_convert(res, lock); + } else if (!lock->convert_pending) { + mlog(0, "%s: res %.*s, owner died and lock has been moved back " + "to granted list, retry convert.\n", + dlm->name, res->lockname.len, res->lockname.name); + status = DLM_RECOVERING; } + + lock->convert_pending = 0; bail: spin_unlock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 42f0cae93a0a..4a338803e7e9 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2064,7 +2064,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, dlm_lock_get(lock); if (lock->convert_pending) { /* move converting lock back to granted */ - BUG_ON(i != DLM_CONVERTING_LIST); mlog(0, "node died with convert pending " "on %.*s. move back to granted list.\n", res->lockname.len, res->lockname.name); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index b002acf50203..60a5f1548cd9 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3321,6 +3321,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALIDÂ being set. + */ + if (!ocfs2_is_o2cb_active() && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0e5b4515f92e..1d738723a41a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -808,7 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* We know that zero_from is block aligned */ for (block_start = zero_from; block_start < zero_to; block_start = block_end) { - block_end = block_start + (1 << inode->i_blkbits); + block_end = block_start + i_blocksize(inode); /* * block_start is block-aligned. Bump it by one to force @@ -1268,20 +1268,20 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = posix_acl_chmod(inode, inode->i_mode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } if (inode_locked) ocfs2_inode_unlock(inode, 1); + brelse(bh); return status; } @@ -1536,7 +1536,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, u64 start, u64 len) { int ret = 0; - u64 tmpend, end = start + len; + u64 tmpend = 0; + u64 end = start + len; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int csize = osb->s_clustersize; handle_t *handle; @@ -1568,18 +1569,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode, } /* - * We want to get the byte offset of the end of the 1st cluster. + * If start is on a cluster boundary and end is somewhere in another + * cluster, we have not COWed the cluster starting at start, unless + * end is also within the same cluster. So, in this case, we skip this + * first call to ocfs2_zero_range_for_truncate() truncate and move on + * to the next one. */ - tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); - if (tmpend > end) - tmpend = end; + if ((start & (csize - 1)) != 0) { + /* + * We want to get the byte offset of the end of the 1st + * cluster. + */ + tmpend = (u64)osb->s_clustersize + + (start & ~(osb->s_clustersize - 1)); + if (tmpend > end) + tmpend = end; - trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, - (unsigned long long)tmpend); + trace_ocfs2_zero_partial_clusters_range1( + (unsigned long long)start, + (unsigned long long)tmpend); - ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); - if (ret) - mlog_errno(ret); + ret = ocfs2_zero_range_for_truncate(inode, handle, start, + tmpend); + if (ret) + mlog_errno(ret); + } if (tmpend < end) { /* diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3123408da935..62af9554541d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; - struct posix_acl *default_acl = NULL, *acl = NULL; struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, @@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (status) { - mlog_errno(status); - goto leave; - } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - if (default_acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_DEFAULT, default_acl, - meta_ac, data_ac); - } - if (!status && acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_ACCESS, acl, - meta_ac, data_ac); - } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); if (status < 0) { mlog_errno(status); @@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..6a0c55d7dff0 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; - struct posix_acl *default_acl, *acl; - umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - mode = inode->i_mode; - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) { - mlog_errno(error); - return error; - } - error = ocfs2_create_inode_in_orphan(dir, mode, + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name, - default_acl, acl); + &new_dentry->d_name); if (error) mlog_errno(error); } out: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5d965e83bd43..783bcdce5666 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; +inline int ocfs2_is_o2cb_active(void) +{ + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); +} +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); + static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 66334a30cea8..e1b30931974d 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,4 +298,7 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ +int ocfs2_is_o2cb_active(void); + #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e9164f09841b..877830b05e12 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7197,12 +7197,10 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl) + const struct qstr *qstr) { - struct buffer_head *dir_bh = NULL; int ret = 0; + struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7215,11 +7213,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - - if (!ret && default_acl) - ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - if (!ret && acl) - ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index f10d5b93c366..1633cc15ea1f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl); + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/open.c b/fs/open.c index b6f1e96a7c0b..fbc5c7b230b3 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = path->dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); - file->f_path = *path; - if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { - inode = dentry->d_op->d_select_inode(dentry, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + file->f_path = *path; return do_dentry_open(file, inode, NULL, cred); } @@ -889,6 +885,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode; + /* + * Clear out all open flags we don't know about so that we don't report + * them in fcntl(F_GETFD) or similar interfaces. + */ + flags &= VALID_OPEN_FLAGS; + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else @@ -995,14 +997,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) EXPORT_SYMBOL(filp_open); struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *filename, int flags) + const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, mode, &op); if (err) return ERR_PTR(err); - if (flags & O_CREAT) - return ERR_PTR(-EINVAL); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5037..63a0d0ba36de 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -25,6 +25,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; int uninitialized_var(error); + size_t slen; if (!old->d_inode->i_op->getxattr || !new->d_inode->i_op->getxattr) @@ -47,7 +48,18 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) goto out; } - for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + + if (ovl_is_private_xattr(name)) + continue; retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) @@ -127,6 +139,8 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) len -= bytes; } + if (!error) + error = vfs_fsync(new_file, 0); fput(new_file); out_fput: fput(old_file); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index a2b1d7ce3e1a..327177df03a5 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -12,6 +12,7 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/atomic.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -35,8 +36,10 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) { struct dentry *temp; char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); - snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { @@ -511,6 +514,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *upper; struct dentry *opaquedir = NULL; int err; + int flags = 0; if (WARN_ON(!workdir)) return -EROFS; @@ -540,46 +544,39 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) if (err) goto out_dput; - whiteout = ovl_whiteout(workdir, dentry); - err = PTR_ERR(whiteout); - if (IS_ERR(whiteout)) + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) goto out_unlock; - upper = ovl_dentry_upper(dentry); - if (!upper) { - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto kill_whiteout; - - err = ovl_do_rename(wdir, whiteout, udir, upper, 0); - dput(upper); - if (err) - goto kill_whiteout; - } else { - int flags = 0; + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && ovl_dentry_upper(dentry) && + upper != ovl_dentry_upper(dentry))) { + goto out_dput_upper; + } - if (opaquedir) - upper = opaquedir; - err = -ESTALE; - if (upper->d_parent != upperdir) - goto kill_whiteout; + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_dput_upper; - if (is_dir) - flags |= RENAME_EXCHANGE; + if (d_is_dir(upper)) + flags = RENAME_EXCHANGE; - err = ovl_do_rename(wdir, whiteout, udir, upper, flags); - if (err) - goto kill_whiteout; + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + if (flags) + ovl_cleanup(wdir, upper); - if (is_dir) - ovl_cleanup(wdir, upper); - } ovl_dentry_version_inc(dentry->d_parent); out_d_drop: d_drop(dentry); dput(whiteout); +out_dput_upper: + dput(upper); out_unlock: unlock_rename(workdir, upperdir); out_dput: @@ -596,21 +593,25 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) { struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *dir = upperdir->d_inode; - struct dentry *upper = ovl_dentry_upper(dentry); + struct dentry *upper; int err; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_unlock; + err = -ESTALE; - if (upper->d_parent == upperdir) { - /* Don't let d_delete() think it can reset d_inode */ - dget(upper); + if (upper == ovl_dentry_upper(dentry)) { if (is_dir) err = vfs_rmdir(dir, upper); else err = vfs_unlink(dir, upper, NULL); - dput(upper); ovl_dentry_version_inc(dentry->d_parent); } + dput(upper); /* * Keeping this dentry hashed would mean having to release @@ -620,6 +621,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) */ if (!err) d_drop(dentry); +out_unlock: mutex_unlock(&dir->i_mutex); return err; @@ -840,29 +842,39 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, trap = lock_rename(new_upperdir, old_upperdir); - olddentry = ovl_dentry_upper(old); - newdentry = ovl_dentry_upper(new); - if (newdentry) { + + olddentry = lookup_one_len(old->d_name.name, old_upperdir, + old->d_name.len); + err = PTR_ERR(olddentry); + if (IS_ERR(olddentry)) + goto out_unlock; + + err = -ESTALE; + if (olddentry != ovl_dentry_upper(old)) + goto out_dput_old; + + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput_old; + + err = -ESTALE; + if (ovl_dentry_upper(new)) { if (opaquedir) { - newdentry = opaquedir; - opaquedir = NULL; + if (newdentry != opaquedir) + goto out_dput; } else { - dget(newdentry); + if (newdentry != ovl_dentry_upper(new)) + goto out_dput; } } else { new_create = true; - newdentry = lookup_one_len(new->d_name.name, new_upperdir, - new->d_name.len); - err = PTR_ERR(newdentry); - if (IS_ERR(newdentry)) - goto out_unlock; + if (!d_is_negative(newdentry) && + (!new_opaque || !ovl_is_whiteout(newdentry))) + goto out_dput; } - err = -ESTALE; - if (olddentry->d_parent != old_upperdir) - goto out_dput; - if (newdentry->d_parent != new_upperdir) - goto out_dput; if (olddentry == trap) goto out_dput; if (newdentry == trap) @@ -925,6 +937,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, out_dput: dput(newdentry); +out_dput_old: + dput(olddentry); out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 05ac9a95e881..220b04f04523 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,6 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + mutex_lock(&upperdentry->d_inode->i_mutex); err = notify_change(upperdentry, attr, NULL); if (!err) @@ -216,7 +219,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) } -static bool ovl_is_private_xattr(const char *name) +bool ovl_is_private_xattr(const char *name) { return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } @@ -274,7 +277,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) struct path realpath; enum ovl_path_type type = ovl_path_real(dentry, &realpath); ssize_t res; - int off; + size_t len; + char *s; res = vfs_listxattr(realpath.dentry, list, size); if (res <= 0 || size == 0) @@ -284,17 +288,19 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; /* filter out private xattrs */ - for (off = 0; off < res;) { - char *s = list + off; - size_t slen = strlen(s) + 1; + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; - BUG_ON(off + slen > res); + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + len -= slen; if (ovl_is_private_xattr(s)) { res -= slen; - memmove(s, s + slen, res - off); + memmove(s, s + slen, len); } else { - off += slen; + s += slen; } } @@ -412,12 +418,11 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, if (!inode) return NULL; - mode &= S_IFMT; - inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOATIME | S_NOCMTIME; + mode &= S_IFMT; switch (mode) { case S_IFDIR: inode->i_private = oe; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e17154aeaae4..c319d5eaabcf 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -174,6 +174,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); +bool ovl_is_private_xattr(const char *name); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); @@ -181,6 +182,7 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; to->i_gid = from->i_gid; + to->i_mode = from->i_mode; } /* dir.c */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 000b2ed05c29..d70208c0de84 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -276,6 +276,37 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) +{ + struct dentry *real; + + if (d_is_dir(dentry)) { + if (!inode || inode == d_inode(dentry)) + return dentry; + goto bug; + } + + real = ovl_dentry_upper(dentry); + if (real && (!inode || inode == d_inode(real))) + return real; + + real = ovl_dentry_lower(dentry); + if (!real) + goto bug; + + if (!inode || inode == d_inode(real)) + return real; + + /* Handle recursion */ + if (real->d_flags & DCACHE_OP_REAL) + return real->d_op->d_real(real, inode); + +bug: + WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry, + inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); + return dentry; +} + static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = dentry->d_fsdata; @@ -320,11 +351,13 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, .d_select_inode = ovl_d_select_inode, + .d_real = ovl_d_real, }; static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, .d_select_inode = ovl_d_select_inode, + .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; @@ -343,7 +376,8 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) static bool ovl_dentry_remote(struct dentry *dentry) { return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_REAL); } static bool ovl_dentry_weird(struct dentry *dentry) @@ -729,6 +763,10 @@ retry: struct kstat stat = { .mode = S_IFDIR | 0, }; + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat.mode, + }; if (work->d_inode) { err = -EEXIST; @@ -744,6 +782,21 @@ retry: err = ovl_create_real(dir, work, &stat, NULL, NULL, true); if (err) goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + if (err && err != -ENODATA && err != -EOPNOTSUPP) + goto out_dput; + + /* Clear any inherited mode bits */ + inode_lock(work->d_inode); + err = notify_change(work, &attr, NULL); + inode_unlock(work->d_inode); + if (err) + goto out_dput; } out_unlock: mutex_unlock(&dir->i_mutex); diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/fs/pnode.c b/fs/pnode.c index 6367e1e435c6..d15c63e97ef1 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -24,6 +24,11 @@ static inline struct mount *first_slave(struct mount *p) return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } +static inline struct mount *last_slave(struct mount *p) +{ + return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); +} + static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); @@ -164,6 +169,19 @@ static struct mount *propagation_next(struct mount *m, } } +static struct mount *skip_propagation_subtree(struct mount *m, + struct mount *origin) +{ + /* + * Advance m such that propagation_next will not return + * the slaves of m. + */ + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + m = last_slave(m); + + return m; +} + static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { @@ -198,10 +216,15 @@ static struct mount *next_group(struct mount *m, struct mount *origin) /* all accesses are serialized by namespace_sem */ static struct user_namespace *user_ns; -static struct mount *last_dest, *last_source, *dest_master; +static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; +static inline bool peers(struct mount *m1, struct mount *m2) +{ + return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; +} + static int propagate_one(struct mount *m) { struct mount *child; @@ -212,24 +235,26 @@ static int propagate_one(struct mount *m) /* skip if mountpoint isn't covered by it */ if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) return 0; - if (m->mnt_group_id == last_dest->mnt_group_id) { + if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; + bool done; for (n = m; ; n = p) { p = n->mnt_master; - if (p == dest_master || IS_MNT_MARKED(p)) { - while (last_dest->mnt_master != p) { - last_source = last_source->mnt_master; - last_dest = last_source->mnt_parent; - } - if (n->mnt_group_id != last_dest->mnt_group_id) { - last_source = last_source->mnt_master; - last_dest = last_source->mnt_parent; - } + if (p == dest_master || IS_MNT_MARKED(p)) break; - } } + do { + struct mount *parent = last_source->mnt_parent; + if (last_source == first_source) + break; + done = parent->mnt_master == p; + if (done && peers(n, parent)) + break; + last_source = last_source->mnt_master; + } while (!done); + type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) @@ -252,7 +277,7 @@ static int propagate_one(struct mount *m) read_sequnlock_excl(&mount_lock); } hlist_add_head(&child->mnt_hash, list); - return 0; + return count_mounts(m->mnt_ns, child); } /* @@ -281,6 +306,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, */ user_ns = current->nsproxy->mnt_ns->user_ns; last_dest = dest_mnt; + first_source = source_mnt; last_source = source_mnt; mp = dest_mp; list = tree_list; @@ -316,6 +342,21 @@ out: return ret; } +static struct mount *find_topper(struct mount *mnt) +{ + /* If there is exactly one mount covering mnt completely return it. */ + struct mount *child; + + if (!list_is_singular(&mnt->mnt_mounts)) + return NULL; + + child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); + if (child->mnt_mountpoint != mnt->mnt.mnt_root) + return NULL; + + return child; +} + /* * return true if the refcount is greater than count */ @@ -336,9 +377,8 @@ static inline int do_refcount_check(struct mount *mnt, int count) */ int propagate_mount_busy(struct mount *mnt, int refcnt) { - struct mount *m, *child; + struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; - int ret = 0; if (mnt == parent) return do_refcount_check(mnt, refcnt); @@ -353,12 +393,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); - if (child && list_empty(&child->mnt_mounts) && - (ret = do_refcount_check(child, 1))) - break; + int count = 1; + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); + if (!child) + continue; + + /* Is there exactly one mount on the child that covers + * it completely whose reference should be ignored? + */ + topper = find_topper(child); + if (topper) + count += 1; + else if (!list_empty(&child->mnt_mounts)) + continue; + + if (do_refcount_check(child, count)) + return 1; } - return ret; + return 0; } /* @@ -375,63 +427,113 @@ void propagate_mount_unlock(struct mount *mnt) for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { - child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } -/* - * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. - */ -static void mark_umount_candidates(struct mount *mnt) +static void umount_one(struct mount *mnt, struct list_head *to_umount) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); - - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { - struct mount *child = __lookup_mnt_last(&m->mnt, - mnt->mnt_mountpoint); - if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { - SET_MNT_MARK(child); - } - } + CLEAR_MNT_MARK(mnt); + mnt->mnt.mnt_flags |= MNT_UMOUNT; + list_del_init(&mnt->mnt_child); + list_del_init(&mnt->mnt_umounting); + list_move_tail(&mnt->mnt_list, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ -static void __propagate_umount(struct mount *mnt) +static bool __propagate_umount(struct mount *mnt, + struct list_head *to_umount, + struct list_head *to_restore) { - struct mount *parent = mnt->mnt_parent; - struct mount *m; - - BUG_ON(parent == mnt); + bool progress = false; + struct mount *child; - for (m = propagation_next(parent, parent); m; - m = propagation_next(m, parent)) { + /* + * The state of the parent won't change if this mount is + * already unmounted or marked as without children. + */ + if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) + goto out; - struct mount *child = __lookup_mnt_last(&m->mnt, - mnt->mnt_mountpoint); - /* - * umount the child only if the child has no children - * and the child is marked safe to unmount. - */ - if (!child || !IS_MNT_MARKED(child)) + /* Verify topper is the only grandchild that has not been + * speculatively unmounted. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; - CLEAR_MNT_MARK(child); - if (list_empty(&child->mnt_mounts)) { - list_del_init(&child->mnt_child); - child->mnt.mnt_flags |= MNT_UMOUNT; - list_move_tail(&child->mnt_list, &mnt->mnt_list); + if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) + continue; + /* Found a mounted child */ + goto children; + } + + /* Mark mounts that can be unmounted if not locked */ + SET_MNT_MARK(mnt); + progress = true; + + /* If a mount is without children and not locked umount it. */ + if (!IS_MNT_LOCKED(mnt)) { + umount_one(mnt, to_umount); + } else { +children: + list_move_tail(&mnt->mnt_umounting, to_restore); + } +out: + return progress; +} + +static void umount_list(struct list_head *to_umount, + struct list_head *to_restore) +{ + struct mount *mnt, *child, *tmp; + list_for_each_entry(mnt, to_umount, mnt_list) { + list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { + /* topper? */ + if (child->mnt_mountpoint == mnt->mnt.mnt_root) + list_move_tail(&child->mnt_umounting, to_restore); + else + umount_one(child, to_umount); } } } +static void restore_mounts(struct list_head *to_restore) +{ + /* Restore mounts to a clean working state */ + while (!list_empty(to_restore)) { + struct mount *mnt, *parent; + struct mountpoint *mp; + + mnt = list_first_entry(to_restore, struct mount, mnt_umounting); + CLEAR_MNT_MARK(mnt); + list_del_init(&mnt->mnt_umounting); + + /* Should this mount be reparented? */ + mp = mnt->mnt_mp; + parent = mnt->mnt_parent; + while (parent->mnt.mnt_flags & MNT_UMOUNT) { + mp = parent->mnt_mp; + parent = parent->mnt_parent; + } + if (parent != mnt->mnt_parent) + mnt_change_mountpoint(parent, mp, mnt); + } +} + +static void cleanup_umount_visitations(struct list_head *visited) +{ + while (!list_empty(visited)) { + struct mount *mnt = + list_first_entry(visited, struct mount, mnt_umounting); + list_del_init(&mnt->mnt_umounting); + } +} + /* * collect all mounts that receive propagation from the mount in @list, * and return these additional mounts in the same list. @@ -442,11 +544,68 @@ static void __propagate_umount(struct mount *mnt) int propagate_umount(struct list_head *list) { struct mount *mnt; + LIST_HEAD(to_restore); + LIST_HEAD(to_umount); + LIST_HEAD(visited); + + /* Find candidates for unmounting */ + list_for_each_entry_reverse(mnt, list, mnt_list) { + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + /* + * If this mount has already been visited it is known that it's + * entire peer group and all of their slaves in the propagation + * tree for the mountpoint has already been visited and there is + * no need to visit them again. + */ + if (!list_empty(&mnt->mnt_umounting)) + continue; + + list_add_tail(&mnt->mnt_umounting, &visited); + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt(&m->mnt, + mnt->mnt_mountpoint); + if (!child) + continue; + + if (!list_empty(&child->mnt_umounting)) { + /* + * If the child has already been visited it is + * know that it's entire peer group and all of + * their slaves in the propgation tree for the + * mountpoint has already been visited and there + * is no need to visit this subtree again. + */ + m = skip_propagation_subtree(m, parent); + continue; + } else if (child->mnt.mnt_flags & MNT_UMOUNT) { + /* + * We have come accross an partially unmounted + * mount in list that has not been visited yet. + * Remember it has been visited and continue + * about our merry way. + */ + list_add_tail(&child->mnt_umounting, &visited); + continue; + } + + /* Check the child and parents while progress is made */ + while (__propagate_umount(child, + &to_umount, &to_restore)) { + /* Is the parent a umount candidate? */ + child = child->mnt_parent; + if (list_empty(&child->mnt_umounting)) + break; + } + } + } - list_for_each_entry_reverse(mnt, list, mnt_list) - mark_umount_candidates(mnt); + umount_list(&to_umount, &to_restore); + restore_mounts(&to_restore); + cleanup_umount_visitations(&visited); + list_splice_tail(&to_umount, list); - list_for_each_entry(mnt, list, mnt_list) - __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 0fcdbe7ca648..dc87e65becd2 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -49,7 +49,10 @@ int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, + struct mount *mnt); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); +int count_mounts(struct mnt_namespace *ns, struct mount *mnt); #endif /* _LINUX_PNODE_H */ diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4adde1e2cbec..993bb3b5f4d5 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -592,6 +592,37 @@ no_mem: } EXPORT_SYMBOL_GPL(posix_acl_create); +/** + * posix_acl_update_mode - update mode in set_acl + * + * Update the file mode when setting an ACL: compute the new file permission + * bits based on the ACL. In addition, if the ACL is equivalent to the new + * file mode, set *acl to NULL to indicate that no ACL should be set. + * + * As with chmod, clear the setgit bit if the caller is not in the owning group + * or capable of CAP_FSETID (see inode_change_ok). + * + * Called from set_acl inode operations. + */ +int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +EXPORT_SYMBOL(posix_acl_update_mode); + /* * Fix up the uids and gids in posix acl extended attributes in place. */ @@ -788,6 +819,28 @@ posix_acl_xattr_get(const struct xattr_handler *handler, return error; } +int +set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (acl) { + int ret = posix_acl_valid(acl); + if (ret) + return ret; + } + return inode->i_op->set_acl(inode, acl, type); +} +EXPORT_SYMBOL(set_posix_acl); + static int posix_acl_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, @@ -799,30 +852,13 @@ posix_acl_xattr_set(const struct xattr_handler *handler, if (strcmp(name, "") != 0) return -EINVAL; - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - if (!inode->i_op->set_acl) - return -EOPNOTSUPP; - - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) - return -EPERM; if (value) { acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } } - - ret = inode->i_op->set_acl(inode, acl, handler->flags); -out: + ret = set_posix_acl(inode, handler->flags, acl); posix_acl_release(acl); return ret; } @@ -867,11 +903,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return 0; - if (error == 0) - acl = NULL; + error = posix_acl_update_mode(inode, + &inode->i_mode, &acl); + if (error) + return error; } inode->i_ctime = CURRENT_TIME; diff --git a/fs/proc/base.c b/fs/proc/base.c index b7de324bec11..dd732400578e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -954,7 +954,8 @@ static ssize_t environ_read(struct file *file, char __user *buf, int ret = 0; struct mm_struct *mm = file->private_data; - if (!mm) + /* Ensure the process spawned far enough to have an environment. */ + if (!mm || !mm->env_end) return 0; page = (char *)__get_free_page(GFP_TEMPORARY); @@ -1544,18 +1545,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); @@ -3062,6 +3058,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[PROC_NUMBUF]; int len; + + cond_resched(); if (!has_pid_permissions(ns, iter.task, 2)) continue; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ff3ffc76a937..3773335791da 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -469,6 +469,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); parent->nlink--; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fe5b6e6c4671..4dbe1e2daeca 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -703,7 +703,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) - return 0; + goto out; pos = 2; @@ -713,6 +713,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) break; } } +out: sysctl_head_finish(head); return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 361ab4ee42fc..ec649c92d270 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 09cd3edde08a..07ef85e19fbc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -248,23 +248,29 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + int stack = 0; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + if (is_pid) { + stack = vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } static void @@ -289,11 +295,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (stack_guard_page_start(vma, start)) - start += PAGE_SIZE; end = vma->vm_end; - if (stack_guard_page_end(vma, end)) - end -= PAGE_SIZE; seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", @@ -324,8 +326,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = arch_vma_name(vma); if (!name) { - pid_t tid; - if (!mm) { name = "[vdso]"; goto done; @@ -337,21 +337,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = pid_of_stack(priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } else { - /* Thread stack in /proc/PID/maps */ - seq_pad(m, ' '); - seq_printf(m, "[stack:%d]", tid); - } - } + if (is_stack(priv, vma, is_pid)) + name = "[stack]"; } done: @@ -812,7 +799,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); @@ -1435,6 +1429,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static struct page *can_gather_numa_stats_pmd(pmd_t pmd, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pmd_present(pmd)) + return NULL; + + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} +#endif + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1444,13 +1464,13 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, vma, addr); + page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) - gather_stats(page, md, pte_dirty(huge_pte), + gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; @@ -1458,6 +1478,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; +#endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, vma, addr); @@ -1539,19 +1560,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else { - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_puts(m, " stack"); - else - seq_printf(m, " stack:%d", tid); - } + } else if (is_stack(proc_priv, vma, is_pid)) { + seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index e0d64c92e4f6..faacb0c0d857 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + struct mm_struct *mm = vma->vm_mm; + int stack = 0; + + if (is_pid) { + stack = vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } /* @@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm) { - pid_t tid = pid_of_stack(priv, vma, is_pid); - - if (tid != 0) { - seq_pad(m, ' '); - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_printf(m, "[stack]"); - else - seq_printf(m, "[stack:%d]", tid); - } + } else if (mm && is_stack(priv, vma, is_pid)) { + seq_pad(m, ' '); + seq_printf(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8ebd9a334085..87645955990d 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -197,6 +197,8 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) if (sb->s_op->show_devname) { seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); + if (err) + goto out; } else { if (r->mnt_devname) { seq_puts(m, "device "); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d813ce..ac6c78fe19cf 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,7 +178,6 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) } static const struct file_operations pstore_file_operations = { - .owner = THIS_MODULE, .open = pstore_file_open, .read = pstore_file_read, .llseek = pstore_file_llseek, diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 319c3a60cfa5..59d93acc29c7 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -375,13 +375,14 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; - cxt->max_dump_cnt = 0; if (!cxt->przs) return; - for (i = 0; !IS_ERR_OR_NULL(cxt->przs[i]); i++) + for (i = 0; i < cxt->max_dump_cnt; i++) persistent_ram_free(cxt->przs[i]); + kfree(cxt->przs); + cxt->max_dump_cnt = 0; } static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, @@ -406,17 +407,22 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, GFP_KERNEL); if (!cxt->przs) { dev_err(dev, "failed to initialize a prz array for dumps\n"); - goto fail_prz; + goto fail_mem; } for (i = 0; i < cxt->max_dump_cnt; i++) { cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, &cxt->ecc_info, - cxt->memtype); + cxt->memtype, 0); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", cxt->record_size, (unsigned long long)*paddr, err); + + while (i > 0) { + i--; + persistent_ram_free(cxt->przs[i]); + } goto fail_prz; } *paddr += cxt->record_size; @@ -424,7 +430,9 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, return 0; fail_prz: - ramoops_free_przs(cxt); + kfree(cxt->przs); +fail_mem: + cxt->max_dump_cnt = 0; return err; } @@ -442,7 +450,8 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, + cxt->memtype, 0); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -583,7 +592,6 @@ static int ramoops_remove(struct platform_device *pdev) struct ramoops_context *cxt = &oops_cxt; pstore_unregister(&cxt->pstore); - cxt->max_dump_cnt = 0; kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 76c3f80efdfa..27300533c2dd 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -48,48 +48,14 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) } /* increase and wrap the start pointer, returning the old value */ -static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a) +static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) { int old; int new; + unsigned long flags = 0; - do { - old = atomic_read(&prz->buffer->start); - new = old + a; - while (unlikely(new >= prz->buffer_size)) - new -= prz->buffer_size; - } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); - - return old; -} - -/* increase the size counter until it hits the max size */ -static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a) -{ - size_t old; - size_t new; - - if (atomic_read(&prz->buffer->size) == prz->buffer_size) - return; - - do { - old = atomic_read(&prz->buffer->size); - new = old + a; - if (new > prz->buffer_size) - new = prz->buffer_size; - } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); -} - -static DEFINE_RAW_SPINLOCK(buffer_lock); - -/* increase and wrap the start pointer, returning the old value */ -static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) -{ - int old; - int new; - unsigned long flags; - - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->start); new = old + a; @@ -97,19 +63,21 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); return old; } /* increase the size counter until it hits the max size */ -static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a) +static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->size); if (old == prz->buffer_size) @@ -121,12 +89,10 @@ static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a) atomic_set(&prz->buffer->size, new); exit: - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); } -static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic; -static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic; - static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { @@ -299,7 +265,7 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, const void *s, unsigned int start, unsigned int count) { struct persistent_ram_buffer *buffer = prz->buffer; - memcpy(buffer->data + start, s, count); + memcpy_toio(buffer->data + start, s, count); persistent_ram_update_ecc(prz, start, count); } @@ -322,8 +288,8 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) } prz->old_log_size = size; - memcpy(prz->old_log, &buffer->data[start], size - start); - memcpy(prz->old_log + size - start, &buffer->data[0], start); + memcpy_fromio(prz->old_log, &buffer->data[start], size - start); + memcpy_fromio(prz->old_log + size - start, &buffer->data[0], start); } int notrace persistent_ram_write(struct persistent_ram_zone *prz, @@ -426,9 +392,6 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size, return NULL; } - buffer_start_add = buffer_start_add_locked; - buffer_size_add = buffer_size_add_locked; - if (memtype) va = ioremap(start, size); else @@ -487,6 +450,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, prz->buffer->sig); } + /* Rewind missing or invalid memory area. */ prz->buffer->sig = sig; persistent_ram_zap(prz); @@ -513,7 +477,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype) + unsigned int memtype, u32 flags) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -524,6 +488,10 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, goto err; } + /* Initialize general buffer state. */ + raw_spin_lock_init(&prz->buffer_lock); + prz->flags = flags; + ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) goto err; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ef0d64b2a6d9..353ff31dcee1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1398,7 +1398,7 @@ static int dquot_active(const struct inode *inode) static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; - struct dquot **dquots, *got[MAXQUOTAS]; + struct dquot **dquots, *got[MAXQUOTAS] = {}; struct super_block *sb = inode->i_sb; qsize_t rsv; int ret = 0; @@ -1415,7 +1415,6 @@ static int __dquot_initialize(struct inode *inode, int type) int rc; struct dquot *dquot; - got[cnt] = NULL; if (type != -1 && cnt != type) continue; /* diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33db4..38187300a2b4 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -189,7 +189,7 @@ int reiserfs_commit_page(struct inode *inode, struct page *page, int ret = 0; th.t_trans_id = 0; - blocksize = 1 << inode->i_blkbits; + blocksize = i_blocksize(inode); if (logit) { reiserfs_write_lock(s); @@ -260,10 +260,10 @@ const struct file_operations reiserfs_file_operations = { const struct inode_operations reiserfs_file_inode_operations = { .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index b751eea32e20..5db6f45b3fed 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -1153,8 +1153,9 @@ int balance_internal(struct tree_balance *tb, insert_ptr); } - memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); insert_ptr[0] = new_insert_ptr; + if (new_insert_ptr) + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); return order; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3d8e7e671d5b..60ba35087d12 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -524,7 +524,7 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, * referenced in convert_tail_for_hole() that may be called from * reiserfs_get_block() */ - bh_result->b_size = (1 << inode->i_blkbits); + bh_result->b_size = i_blocksize(inode); ret = reiserfs_get_block(inode, iblock, bh_result, create | GET_BLOCK_NO_DANGLE); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 47f96988fdd4..3ebc70167e41 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1649,10 +1649,10 @@ const struct inode_operations reiserfs_dir_inode_operations = { .mknod = reiserfs_mknod, .rename = reiserfs_rename, .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, @@ -1667,10 +1667,10 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, }; @@ -1679,10 +1679,10 @@ const struct inode_operations reiserfs_symlink_inode_operations = { */ const struct inode_operations reiserfs_special_inode_operations = { .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, + .removexattr = generic_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, .set_acl = reiserfs_set_acl, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc3bf..f9f3be50081a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -190,7 +190,15 @@ static int remove_save_link_only(struct super_block *s, static int reiserfs_quota_on_mount(struct super_block *, int); #endif -/* look for uncompleted unlinks and truncates and complete them */ +/* + * Look for uncompleted unlinks and truncates and complete them + * + * Called with superblock write locked. If quotas are enabled, we have to + * release/retake lest we call dquot_quota_on_mount(), proceed to + * schedule_on_each_cpu() in invalidate_bdev() and deadlock waiting for the per + * cpu worklets to complete flush_async_commits() that in turn wait for the + * superblock write lock. + */ static int finish_unfinished(struct super_block *s) { INITIALIZE_PATH(path); @@ -237,7 +245,9 @@ static int finish_unfinished(struct super_block *s) quota_enabled[i] = 0; continue; } + reiserfs_write_unlock(s); ret = reiserfs_quota_on_mount(s, i); + reiserfs_write_lock(s); if (ret < 0) reiserfs_warning(s, "reiserfs-2500", "cannot turn on journaled " diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 66b26fdfff8d..a8dbc93e45eb 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -763,60 +763,6 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers, return xah; } - -/* - * Inode operation getxattr() - */ -ssize_t -reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, - size_t size) -{ - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - - if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - return handler->get(handler, dentry, name, buffer, size); -} - -/* - * Inode operation setxattr() - * - * d_inode(dentry)->i_mutex down - */ -int -reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - - if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - return handler->set(handler, dentry, name, value, size, flags); -} - -/* - * Inode operation removexattr() - * - * d_inode(dentry)->i_mutex down - */ -int reiserfs_removexattr(struct dentry *dentry, const char *name) -{ - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - - if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) - return -EOPNOTSUPP; - - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); -} - struct listxattr_buf { struct dir_context ctx; size_t size; diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index 15dde6262c00..613ff5aef94e 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -2,6 +2,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/rwsem.h> +#include <linux/xattr.h> struct inode; struct dentry; @@ -18,12 +19,7 @@ int reiserfs_permission(struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); -int reiserfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -int reiserfs_removexattr(struct dentry *dentry, const char *name); int reiserfs_xattr_get(struct inode *, const char *, void *, size_t); int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int); @@ -92,10 +88,7 @@ static inline void reiserfs_init_xattr_rwsem(struct inode *inode) #else -#define reiserfs_getxattr NULL -#define reiserfs_setxattr NULL #define reiserfs_listxattr NULL -#define reiserfs_removexattr NULL static inline void reiserfs_init_xattr_rwsem(struct inode *inode) { diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 4b34b9dc03dd..9b1824f35501 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -246,13 +246,9 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; - else { - if (error == 0) - acl = NULL; - } } break; case ACL_TYPE_DEFAULT: diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ac659af431ae..60de069225ba 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -12,26 +12,24 @@ static int security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; - if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size); } static int security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; - if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size, flags); } static size_t security_list(const struct xattr_handler *handler, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a338adf1b8b4..ebba1ebf28ad 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -11,26 +11,24 @@ static int trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size); } static int trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size, flags); } static size_t trusted_list(const struct xattr_handler *handler, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 39c9667191c5..6ac8a8c8bd9c 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -10,24 +10,22 @@ static int user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size); } static int user_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; - if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), + xattr_full_name(handler, name), + buffer, size, flags); } static size_t user_list(const struct xattr_handler *handler, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 268733cda397..5f4f1882dc7d 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -74,6 +74,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; @@ -415,7 +416,22 @@ static void romfs_destroy_inode(struct inode *inode) static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + /* When calling huge_encode_dev(), + * use sb->s_bdev->bd_dev when, + * - CONFIG_ROMFS_ON_BLOCK defined + * use sb->s_dev when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD defined + * leave id as 0 when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD undefined + */ + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; @@ -488,6 +504,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY | MS_NOATIME; sb->s_op = &romfs_super_ops; +#ifdef CONFIG_ROMFS_ON_MTD + /* Use same dev ID from the underlying mtdblock device */ + if (sb->s_mtd) + sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); +#endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) diff --git a/fs/seq_file.c b/fs/seq_file.c index e85664b7c7d9..6dc4296eed62 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -72,9 +72,10 @@ int seq_open(struct file *file, const struct seq_operations *op) mutex_init(&p->lock); p->op = op; -#ifdef CONFIG_USER_NS - p->user_ns = file->f_cred->user_ns; -#endif + + // No refcounting: the lifetime of 'p' is constrained + // to the lifetime of the file. + p->file = file; /* * Wrappers around seq_open(e.g. swaps_open) need to be @@ -222,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) + if (!m->count) { + m->from = 0; m->index++; + } if (!size) goto Done; } diff --git a/fs/splice.c b/fs/splice.c index 4cf700d50b40..8398974e1538 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; int ret, do_wakeup, page_nr; + if (!spd_pages) + return 0; + ret = 0; do_wakeup = 0; page_nr = 0; @@ -208,6 +211,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; + buf->flags = 0; if (spd->flags & SPLICE_F_GIFT) buf->flags |= PIPE_BUF_FLAG_GIFT; diff --git a/fs/stat.c b/fs/stat.c index d4a61d8dc021..004dd77c3b93 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -31,7 +31,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; - stat->blksize = (1 << inode->i_blkbits); + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; } @@ -454,6 +454,7 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_bytes -= 512; } } +EXPORT_SYMBOL(__inode_add_bytes); void inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/super.c b/fs/super.c index f5f4b328f860..d4d2591b77c8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1326,8 +1326,8 @@ int freeze_super(struct super_block *sb) } } /* - * This is just for debugging purposes so that fs can warn if it - * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + * For debugging purposes so that fs can warn if it sees write activity + * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); @@ -1346,7 +1346,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_writers.frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f35523d4fa3a..39c75a86c67f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -108,16 +108,24 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; - size_t len; + ssize_t len; /* * If buf != of->prealloc_buf, we don't know how * large it is, so cannot safely pass it to ->show */ - if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + if (WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; len = ops->show(kobj, of->kn->priv, buf); - return min(count, len); + if (len < 0) + return len; + if (pos) { + if (len <= pos) + return 0; + len -= pos; + memmove(buf, buf + pos, len); + } + return min_t(ssize_t, count, len); } /* kernfs write callback for regular sysfs files */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 053818dd6c18..1327a02ec778 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -40,6 +40,7 @@ struct timerfd_ctx { short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; + spinlock_t cancel_lock; bool might_cancel; }; @@ -112,7 +113,7 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } -static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { ctx->might_cancel = false; @@ -122,6 +123,13 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx) } } +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + spin_lock(&ctx->cancel_lock); + __timerfd_remove_cancel(ctx); + spin_unlock(&ctx->cancel_lock); +} + static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) @@ -132,6 +140,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) { + spin_lock(&ctx->cancel_lock); if ((ctx->clockid == CLOCK_REALTIME || ctx->clockid == CLOCK_REALTIME_ALARM) && (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { @@ -141,9 +150,10 @@ static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) list_add_rcu(&ctx->clist, &cancel_list); spin_unlock(&cancel_lock); } - } else if (ctx->might_cancel) { - timerfd_remove_cancel(ctx); + } else { + __timerfd_remove_cancel(ctx); } + spin_unlock(&ctx->cancel_lock); } static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) @@ -395,6 +405,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return -ENOMEM; init_waitqueue_head(&ctx->wqh); + spin_lock_init(&ctx->cancel_lock); ctx->clockid = clockid; if (isalarm(ctx)) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808bf3..f5d5ee43ae6e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -350,7 +350,7 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err; + int err = 0; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; @@ -452,14 +452,20 @@ out: kfree(file->private_data); file->private_data = NULL; - if (err != -ENOENT) { + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); - return err; - } + else + /* + * -ENOENT is a non-fatal error in this context, the TNC uses + * it to indicate that the cursor moved past the current directory + * and readdir() has to stop. + */ + err = 0; + /* 2 is a special value indicating that there are no more direntries */ ctx->pos = 2; - return 0; + return err; } /* Free saved readdir() state when the directory is closed */ diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0edc12856147..b895af7d8d80 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -52,6 +52,7 @@ #include "ubifs.h" #include <linux/mount.h> #include <linux/slab.h> +#include <linux/migrate.h> static int read_block(struct inode *inode, void *addr, unsigned int block, struct ubifs_data_node *dn) @@ -1452,6 +1453,26 @@ static int ubifs_set_page_dirty(struct page *page) return ret; } +#ifdef CONFIG_MIGRATION +static int ubifs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc; + + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + SetPagePrivate(newpage); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) { /* @@ -1591,6 +1612,9 @@ const struct address_space_operations ubifs_file_address_operations = { .write_end = ubifs_write_end, .invalidatepage = ubifs_invalidatepage, .set_page_dirty = ubifs_set_page_dirty, +#ifdef CONFIG_MIGRATION + .migratepage = ubifs_migrate_page, +#endif .releasepage = ubifs_releasepage, }; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa9a20cc60d6..fe5e8d4970ae 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -34,6 +34,11 @@ #include <linux/slab.h> #include "ubifs.h" +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs); +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node); + /* * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. * @NAME_LESS: name corresponding to the first argument is less than second @@ -402,7 +407,19 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, return 0; } - err = ubifs_tnc_read_node(c, zbr, node); + if (c->replaying) { + err = fallible_read_node(c, &zbr->key, zbr, node); + /* + * When the node was not found, return -ENOENT, 0 otherwise. + * Negative return codes stay as-is. + */ + if (err == 0) + err = -ENOENT; + else if (err == 1) + err = 0; + } else { + err = ubifs_tnc_read_node(c, zbr, node); + } if (err) return err; @@ -2766,7 +2783,11 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, if (nm->name) { if (err) { /* Handle collisions */ - err = resolve_collision(c, key, &znode, &n, nm); + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); if (unlikely(err < 0)) diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index b45345d701e7..51157da3f76e 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) p = c->gap_lebs; do { - ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs); written = layout_leb_in_gaps(c, p); if (written < 0) { err = written; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e8b01b721e99..b5bf23b34241 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -173,6 +173,7 @@ out_cancel: host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); host_ui->xattr_size -= CALC_XATTR_BYTES(size); + host_ui->xattr_names -= nm->len; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -533,6 +534,7 @@ out_cancel: host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(nm->len); host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); + host_ui->xattr_names += nm->len; mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 566df9b5a6cb..0e659d9c69a1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1206,7 +1206,7 @@ int udf_setsize(struct inode *inode, loff_t newsize) { int err; struct udf_inode_info *iinfo; - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -1235,8 +1235,8 @@ int udf_setsize(struct inode *inode, loff_t newsize) return err; } set_size: - truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); + truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); @@ -1253,9 +1253,9 @@ set_size: udf_get_block); if (err) return err; + truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - truncate_setsize(inode, newsize); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index dc5fae601c24..637e17cb0edd 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -81,7 +81,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ufs_error (sb, "ufs_free_fragments", "bit already cleared for fragment %u", i); } - + + inode_sub_bytes(inode, count << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, count); uspi->cs_total.cs_nffree += count; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); @@ -183,6 +184,7 @@ do_more: ufs_error(sb, "ufs_free_blocks", "freeing free fragment"); } ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); + inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) ufs_clusteracct (sb, ucpi, blkno, 1); @@ -494,6 +496,20 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return 0; } +static bool try_add_frags(struct inode *inode, unsigned frags) +{ + unsigned size = frags * i_blocksize(inode); + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, size); + if (unlikely((u32)inode->i_blocks != inode->i_blocks)) { + __inode_sub_bytes(inode, size); + spin_unlock(&inode->i_lock); + return false; + } + spin_unlock(&inode->i_lock); + return true; +} + static u64 ufs_add_fragments(struct inode *inode, u64 fragment, unsigned oldcount, unsigned newcount) { @@ -530,6 +546,9 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, for (i = oldcount; i < newcount; i++) if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i)) return 0; + + if (!try_add_frags(inode, count)) + return 0; /* * Block can be extended */ @@ -647,6 +666,7 @@ cg_found: ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); i = uspi->s_fpb - count; + inode_sub_bytes(inode, i << uspi->s_fshift); fs32_add(sb, &ucg->cg_cs.cs_nffree, i); uspi->cs_total.cs_nffree += i; fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i); @@ -657,6 +677,8 @@ cg_found: result = ufs_bitmap_search (sb, ucpi, goal, allocsize); if (result == INVBLOCK) return 0; + if (!try_add_frags(inode, count)) + return 0; for (i = 0; i < count; i++) ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); @@ -716,6 +738,8 @@ norot: return INVBLOCK; ucpi->c_rotor = result; gotit: + if (!try_add_frags(inode, uspi->s_fpb)) + return 0; blkno = ufs_fragstoblks(result); ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a064cf44b143..1f69bb9b1e9d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -235,7 +235,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, p = ufs_get_direct_data_ptr(uspi, ufsi, block); tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), - new_size, err, locked_page); + new_size - (lastfrag & uspi->s_fpbmask), err, + locked_page); return tmp != 0; } @@ -284,7 +285,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, locked_page); + goal, nfrags, err, locked_page); if (!tmp) { *err = -ENOSPC; @@ -402,7 +403,9 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); - goto out; + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } /* This code entered only while writing ....? */ diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec02ca..10f364490833 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -746,6 +746,23 @@ static void ufs_put_super(struct super_block *sb) return; } +static u64 ufs_max_bytes(struct super_block *sb) +{ + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int bits = uspi->s_apbshift; + u64 res; + + if (bits > 21) + res = ~0ULL; + else + res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + + (1LL << (3*bits)); + + if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) + return MAX_LFS_FILESIZE; + return res << uspi->s_bshift; +} + static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; @@ -1212,6 +1229,7 @@ magic_found: "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } + sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 954175928240..3f9463f8cf2f 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -473,15 +473,19 @@ static inline unsigned _ubh_find_last_zero_bit_( static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned begin, unsigned block) { + u8 mask; switch (uspi->s_fpb) { case 8: return (*ubh_get_addr (ubh, begin + block) == 0xff); case 4: - return (*ubh_get_addr (ubh, begin + (block >> 1)) == (0x0f << ((block & 0x01) << 2))); + mask = 0x0f << ((block & 0x01) << 2); + return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask; case 2: - return (*ubh_get_addr (ubh, begin + (block >> 2)) == (0x03 << ((block & 0x03) << 1))); + mask = 0x03 << ((block & 0x03) << 1); + return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask; case 1: - return (*ubh_get_addr (ubh, begin + (block >> 3)) == (0x01 << (block & 0x07))); + mask = 0x01 << (block & 0x07); + return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask; } return 0; } diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..cb771c30d102 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -87,20 +87,7 @@ static int utimes_common(struct path *path, struct timespec *times) */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { - /* - * If times is NULL (or both times are UTIME_NOW), - * then we need to check permissions, because - * inode_change_ok() won't do it. - */ - error = -EACCES; - if (IS_IMMUTABLE(inode)) - goto mnt_drop_write_and_out; - - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); - if (error) - goto mnt_drop_write_and_out; - } + newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: mutex_lock(&inode->i_mutex); @@ -112,7 +99,6 @@ retry_deleg: goto retry_deleg; } -mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: return error; diff --git a/fs/xattr.c b/fs/xattr.c index 9b932b95d74e..f0da9d24e9ca 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -442,7 +442,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, size = XATTR_SIZE_MAX; kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!kvalue) { - vvalue = vmalloc(size); + vvalue = vzalloc(size); if (!vvalue) return -ENOMEM; kvalue = vvalue; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3479294c1d58..e1e7fe3b5424 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -535,6 +535,7 @@ xfs_agfl_write_verify( } const struct xfs_buf_ops xfs_agfl_buf_ops = { + .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, }; @@ -2339,6 +2340,7 @@ xfs_agf_write_verify( } const struct xfs_buf_ops xfs_agf_buf_ops = { + .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 90de071dd4c2..eb8bbfe85484 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -379,6 +379,7 @@ xfs_allocbt_write_verify( } const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index aa187f7ba2dd..01a5ecfedfcf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -328,6 +328,7 @@ xfs_attr3_leaf_read_verify( } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5ab95ffa4ae9..f3ed9bf0b065 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify( } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 119c2422aac7..75884aecf920 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2179,8 +2179,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2232,7 +2234,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6b0cf6546a82..1637c37bfbaa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -720,6 +720,7 @@ xfs_bmbt_write_verify( } const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index af1bbee5586e..28bc5e78b110 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4064,7 +4064,7 @@ xfs_btree_change_owner( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e89a0f8f827c..097bf7717d80 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -245,6 +245,7 @@ xfs_da3_node_read_verify( } const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9c10e2b8cfcb..aa17cb788946 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -123,6 +123,7 @@ xfs_dir3_block_write_verify( } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index af71a84f343c..725fc7841fde 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -305,11 +305,13 @@ xfs_dir3_data_write_verify( } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .name = "xfs_dir3_data_reada", .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 3923e1f94697..b887fb2a2bcf 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -245,11 +245,13 @@ xfs_dir3_leafn_write_verify( } const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 70b0cb2fd556..63ee03db796c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -150,6 +150,7 @@ xfs_dir3_free_write_verify( } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5331b7f0460c..ac9a003dd29a 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -54,7 +54,7 @@ xfs_dqcheck( xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ uint flags, - char *str) + const char *str) { xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; int errs = 0; @@ -191,8 +191,7 @@ xfs_dquot_buf_verify_crc( if (mp->m_quotainfo) ndquots = mp->m_quotainfo->qi_dqperchunk; else - ndquots = xfs_calc_dquots_per_chunk( - XFS_BB_TO_FSB(mp, bp->b_length)); + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), @@ -207,7 +206,8 @@ xfs_dquot_buf_verify_crc( STATIC bool xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + int warn) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; xfs_dqid_t id = 0; @@ -240,8 +240,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_buf_verify"); + error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); if (error) return false; } @@ -256,7 +255,7 @@ xfs_dquot_buf_read_verify( if (!xfs_dquot_buf_verify_crc(mp, bp)) xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp)) + else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -264,6 +263,25 @@ xfs_dquot_buf_read_verify( } /* + * readahead errors are silent and simply leave the buffer as !done so a real + * read will then be run with the xfs_dquot_buf_ops verifier. See + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than + * reporting the failure. + */ +static void +xfs_dquot_buf_readahead_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || + !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + } +} + +/* * we don't calculate the CRC here as that is done when the dquot is flushed to * the buffer after the update is done. This ensures that the dquot in the * buffer always has an up-to-date CRC value. @@ -274,7 +292,7 @@ xfs_dquot_buf_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify(mp, bp)) { + if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; @@ -282,7 +300,13 @@ xfs_dquot_buf_write_verify( } const struct xfs_buf_ops xfs_dquot_buf_ops = { + .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, }; +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { + .name = "xfs_dquot_ra", + .verify_read = xfs_dquot_buf_readahead_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 70c1db99f6a7..66d702e6b9ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2572,6 +2572,7 @@ xfs_agi_write_verify( } const struct xfs_buf_ops xfs_agi_buf_ops = { + .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index f39b285beb19..6dd44f9ea727 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -304,6 +304,7 @@ xfs_inobt_write_verify( } const struct xfs_buf_ops xfs_inobt_buf_ops = { + .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 65485cfc4ade..7183b7ea065b 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -68,6 +68,8 @@ xfs_inobp_check( * recovery and we don't get unnecssary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. + * Changes to this readahead error behavour also need to be reflected in + * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( @@ -134,11 +136,13 @@ xfs_inode_buf_write_verify( } const struct xfs_buf_ops xfs_inode_buf_ops = { + .name = "xfs_inode", .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .name = "xxfs_inode_ra", .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; @@ -295,6 +299,14 @@ xfs_dinode_verify( if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + /* No zero-length symlinks. */ + if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 1b0a08379759..f51078f1e92a 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, char *str); + xfs_dqid_t id, uint type, uint flags, const char *str); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a0b071d881a0..7088be6afb3c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -581,7 +581,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + return xfs_mount_validate_sb(mp, &sb, + bp->b_maps[0].bm_bn == XFS_SB_DADDR, check_version); } @@ -679,11 +680,13 @@ xfs_sb_write_verify( } const struct xfs_buf_ops xfs_sb_buf_ops = { + .name = "xfs_sb", .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .name = "xfs_sb_quiet", .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 5be529707903..15c3ceb845b9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index cb6fd20a4d3d..2e2c6716b623 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -168,6 +168,7 @@ xfs_symlink_write_verify( } const struct xfs_buf_ops xfs_symlink_buf_ops = { + .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, }; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6bb470fbb8e8..c5101a3295d8 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -288,16 +288,11 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - - if (error <= 0) { - acl = NULL; - - if (error < 0) - return error; - } + umode_t mode; + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + return error; error = xfs_set_mode(inode, mode); if (error) return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29e7e5dd5178..a9063ac50c4e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -288,7 +288,7 @@ xfs_map_blocks( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - ssize_t count = 1 << inode->i_blkbits; + ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb, end_fsb; int error = 0; int bmapi_flags = XFS_BMAPI_ENTIRE; @@ -921,7 +921,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += 1 << inode->i_blkbits; + offset += i_blocksize(inode); } while ((bh = bh->b_this_page) != head); @@ -1363,7 +1363,7 @@ xfs_map_trim_size( offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); + i_blocksize(inode)); } if (mapping_size > LONG_MAX) mapping_size = LONG_MAX; @@ -1395,7 +1395,7 @@ __xfs_get_blocks( return -EIO; offset = (xfs_off_t)iblock << inode->i_blkbits; - ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + ASSERT(bh_result->b_size >= i_blocksize(inode)); size = bh_result->b_size; if (!create && direct && offset >= i_size_read(inode)) @@ -1426,6 +1426,26 @@ __xfs_get_blocks( if (error) goto out_unlock; + /* + * The only time we can ever safely find delalloc blocks on direct I/O + * is a dio write to post-eof speculative preallocation. All other + * scenarios are indicative of a problem or misuse (such as mixing + * direct and mapped I/O). + * + * The file may be unmapped by the time we get here so we cannot + * reliably fail the I/O based on mapping. Instead, fail the I/O if this + * is a read or a write within eof. Otherwise, carry on but warn as a + * precuation if the file happens to be mapped. + */ + if (direct && imap.br_startblock == DELAYSTARTBLOCK) { + if (!create || offset < i_size_read(VFS_I(ip))) { + WARN_ON_ONCE(1); + error = -EIO; + goto out_unlock; + } + WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping)); + } + /* for DAX, we convert unwritten extents directly */ if (create && (!nimaps || @@ -1525,7 +1545,6 @@ __xfs_get_blocks( set_buffer_new(bh_result); if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); if (create) { set_buffer_uptodate(bh_result); set_buffer_mapped(bh_result); @@ -1968,7 +1987,7 @@ xfs_vm_set_page_dirty( if (offset < end_offset) set_buffer_dirty(bh); bh = bh->b_this_page; - offset += 1 << inode->i_blkbits; + offset += i_blocksize(inode); } while (bh != head); } /* diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index dd4824589470..234331227c0c 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,6 +112,7 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ +/* Return 0 on success, or -errno; other state communicated via *context */ typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int, unsigned char *); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 0ef7c2ed3f8a..c8be331a3196 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -108,16 +108,14 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (int)sfe->namelen, (int)sfe->valuelen, &sfe->nameval[sfe->namelen]); - + if (error) + return error; /* * Either search callback finished early or * didn't fit it all in the buffer after all. */ if (context->seen_enough) break; - - if (error) - return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } trace_xfs_attr_list_sf_all(context); @@ -202,8 +200,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->namelen, sbp->valuelen, &sbp->name[sbp->namelen]); - if (error) + if (error) { + kmem_free(sbuf); return error; + } if (context->seen_enough) break; cursor->offset++; @@ -454,14 +454,13 @@ xfs_attr3_leaf_list_int( args.rmtblkcnt = xfs_attr3_rmt_blocks( args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); - if (retval) - return retval; - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); + if (!retval) + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); kmem_free(args.value); } else { retval = context->put_listent(context, @@ -580,7 +579,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 1; + return 0; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index dbae6490a79a..863e1bff403b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -682,7 +682,7 @@ xfs_getbmap( * extents. */ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && @@ -1713,6 +1713,7 @@ xfs_swap_extents( xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; + xfs_extnum_t nextents; int src_log_flags, target_log_flags; int error = 0; int aforkblks = 0; @@ -1899,7 +1900,8 @@ xfs_swap_extents( * pointer. Otherwise it's already NULL or * pointing to the extent. */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + nextents = ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents <= XFS_INLINE_EXTS) { ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } @@ -1918,7 +1920,8 @@ xfs_swap_extents( * pointer. Otherwise it's already NULL or * pointing to the extent. */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + nextents = tip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents <= XFS_INLINE_EXTS) { tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 39090fc56f09..dcb70969ff1c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -375,6 +375,7 @@ retry: out_free_pages: for (i = 0; i < bp->b_page_count; i++) __free_page(bp->b_pages[i]); + bp->b_flags &= ~_XBF_PAGES; return error; } @@ -978,6 +979,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1535,7 +1538,7 @@ xfs_wait_buftarg( * ensure here that all reference counts have been dropped before we * start walking the LRU list. */ - drain_workqueue(btp->bt_mount->m_buf_workqueue); + flush_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { @@ -1712,6 +1715,28 @@ error: } /* + * Cancel a delayed write list. + * + * Remove each buffer from the list, clear the delwri queue flag and drop the + * associated buffer reference. + */ +void +xfs_buf_delwri_cancel( + struct list_head *list) +{ + struct xfs_buf *bp; + + while (!list_empty(list)) { + bp = list_first_entry(list, struct xfs_buf, b_list); + + xfs_buf_lock(bp); + bp->b_flags &= ~_XBF_DELWRI_Q; + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } +} + +/* * Add a buffer to the delayed write list. * * This queues a buffer for writeout if it hasn't already been. Note that diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c79b717d9b88..149bbd451731 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -132,6 +132,7 @@ struct xfs_buf_map { struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { + char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); }; @@ -303,6 +304,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, extern void *xfs_buf_offset(struct xfs_buf *, size_t); /* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 642d55d10075..2fbf643fa10a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -406,6 +406,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? + * Each loop tries to process 1 full dir blk; last may be partial. */ blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; @@ -416,7 +417,8 @@ xfs_dir2_leaf_readbuf( * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= geo->fsbcount) { + (map[mip->ra_index].br_blockcount - mip->ra_offset) >= + geo->fsbcount) { xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(dp->i_mount, @@ -437,14 +439,19 @@ xfs_dir2_leaf_readbuf( } /* - * Advance offset through the mapping table. + * Advance offset through the mapping table, processing a full + * dir block even if it is fragmented into several extents. + * But stop if we have consumed all valid mappings, even if + * it's not yet a full directory block. */ - for (j = 0; j < geo->fsbcount; j += length ) { + for (j = 0; + j < geo->fsbcount && mip->ra_index < mip->map_valid; + j += length ) { /* * The rest of this extent but not more than a dir * block. */ - length = min_t(int, geo->fsbcount, + length = min_t(int, geo->fsbcount - j, map[mip->ra_index].br_blockcount - mip->ra_offset); mip->ra_offset += length; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 74d0e5966ebc..88693a98fac5 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -164,9 +164,9 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_bn); + __return_address, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f5392ab2def1..3dd47307363f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -947,7 +947,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; if (offset & blksize_mask || len & blksize_mask) { error = -EINVAL; @@ -969,7 +969,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + unsigned int blksize_mask = i_blocksize(inode) - 1; new_size = i_size_read(inode) + len; if (offset & blksize_mask || len & blksize_mask) { @@ -1208,7 +1208,7 @@ xfs_find_get_desired_pgoff( unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); /* @@ -1235,17 +1235,6 @@ xfs_find_get_desired_pgoff( break; } - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } - for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; loff_t b_offset; @@ -1257,18 +1246,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. */ + if (page->index > end) + goto out; lock_page(page); /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ee3aaa0a5317..ca0d3eb44925 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -243,8 +243,8 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - agf->agf_flfirst = 0; - agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d7a490f24ead..adbc1f59969a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -210,14 +210,17 @@ xfs_iget_cache_hit( error = inode_init_always(mp->m_super, inode); if (error) { + bool wake; /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - + wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; @@ -363,6 +366,22 @@ out_destroy: return error; } +static void +xfs_inew_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (!xfs_iflags_test(ip, XFS_INEW)) + break; + schedule(); + } while (true); + finish_wait(wq, &wait.wait); +} + /* * Look up an inode by number in the given file system. * The inode is looked up in the cache held in each AG. @@ -467,9 +486,11 @@ out_error_or_again: STATIC int xfs_inode_ag_walk_grab( - struct xfs_inode *ip) + struct xfs_inode *ip, + int flags) { struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -487,7 +508,8 @@ xfs_inode_ag_walk_grab( goto out_unlock_noent; /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -515,7 +537,8 @@ xfs_inode_ag_walk( void *args), int flags, void *args, - int tag) + int tag, + int iter_flags) { uint32_t first_index; int last_error = 0; @@ -557,7 +580,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip)) + if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -585,6 +608,9 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; + if ((iter_flags & XFS_AGITER_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == -EAGAIN) { @@ -637,12 +663,13 @@ xfs_eofblocks_worker( } int -xfs_inode_ag_iterator( +xfs_inode_ag_iterator_flags( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, - void *args) + void *args, + int iter_flags) { struct xfs_perag *pag; int error = 0; @@ -652,7 +679,8 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, + iter_flags); xfs_perag_put(pag); if (error) { last_error = error; @@ -664,6 +692,17 @@ xfs_inode_ag_iterator( } int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); +} + +int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, @@ -680,7 +719,8 @@ xfs_inode_ag_iterator_tag( ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, tag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, + 0); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 62f1f91c32cb..147a79212e63 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -48,6 +48,11 @@ struct xfs_eofblocks { #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +/* + * flags for AG inode iterator + */ +#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -72,6 +77,9 @@ void xfs_eofblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); +int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int iter_flags); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ee393996b7d..f0ce28cd311d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3220,13 +3220,14 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || + spin_lock(&iq->i_flags_lock); + if (!iq->i_ino || + __xfs_iflags_test(iq, XFS_ISTALE) || (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); + spin_unlock(&iq->i_flags_lock); continue; } - spin_unlock(&ip->i_flags_lock); + spin_unlock(&iq->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and @@ -3342,7 +3343,7 @@ xfs_iflush( struct xfs_buf **bpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp; + struct xfs_buf *bp = NULL; struct xfs_dinode *dip; int error; @@ -3384,14 +3385,22 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. + * Get the buffer containing the on-disk inode. We are doing a try-lock + * operation here, so we may get an EAGAIN error. In that case, we + * simply want to return with the inode still dirty. + * + * If we get any other error, we effectively have a corruption situation + * and we cannot flush the inode, so we treat it the same as failing + * xfs_iflush_int(). */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error || !bp) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } + if (error) + goto corrupt_out; /* * First flush out the inode that xfs_iflush was called with. @@ -3419,7 +3428,8 @@ xfs_iflush( return 0; corrupt_out: - xfs_buf_relse(bp); + if (bp) + xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: error = -EFSCORRUPTED; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca9e11989cbd..ae1a49845744 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -208,7 +208,8 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define __XFS_INEW_BIT 3 /* inode has just been allocated */ +#define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -453,6 +454,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738deec6d..e4a4f82ea13f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -403,6 +403,7 @@ xfs_attrlist_by_handle( { int error = -ENOMEM; attrlist_cursor_kern_t *cursor; + struct xfs_fsop_attrlist_handlereq __user *p = arg; xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; char *kbuf; @@ -435,6 +436,11 @@ xfs_attrlist_by_handle( if (error) goto out_kfree; + if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + error = -EFAULT; + goto out_kfree; + } + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) error = -EFAULT; @@ -1379,10 +1385,11 @@ xfs_ioc_getbmap( unsigned int cmd, void __user *arg) { - struct getbmapx bmx; + struct getbmapx bmx = { 0 }; int error; - if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + /* struct getbmap is a strict subset of struct getbmapx. */ + if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) return -EFAULT; if (bmx.bmv_count < 2) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ec0e239a0fa9..201aae0b2662 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -369,7 +369,14 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #endif /* DEBUG */ #ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* + * make sure we ignore the inode flag if the filesystem doesn't have a + * configured realtime device. + */ +#define XFS_IS_REALTIME_INODE(ip) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (ip)->i_mount->m_rtdev_targp) #else #define XFS_IS_REALTIME_INODE(ip) (0) #endif diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c5ecaacdd218..8cab78eeb0c2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3204,6 +3204,7 @@ xlog_recover_dquot_ra_pass2( struct xfs_disk_dquot *recddq; struct xfs_dq_logformat *dq_f; uint type; + int len; if (mp->m_qflags == 0) @@ -3224,8 +3225,12 @@ xlog_recover_dquot_ra_pass2( ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); + len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); + if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, + &xfs_dquot_buf_ra_ops); } STATIC void @@ -3975,6 +3980,7 @@ xlog_recover_clear_agi_bucket( agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 532ab79d38fe..572b64a135b3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1355,12 +1355,7 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: - while (!list_empty(&buffer_list)) { - struct xfs_buf *bp = - list_first_entry(&buffer_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); - xfs_buf_relse(bp); - } + xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 3640c6e896af..4d334440bd94 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -764,5 +764,6 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); + xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, + XFS_AGITER_INEW_WAIT); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 36bd8825bfb0..ef64a1e1a66a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1233,6 +1233,16 @@ xfs_fs_remount( return -EINVAL; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 839b35ca21c6..9beaf192b4bb 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -180,7 +180,8 @@ xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ - return 1; + context->seen_enough = 1; + return 0; } offset = (char *)context->alist + context->count; strncpy(offset, xfs_xattr_prefix(flags), prefix_len); @@ -222,12 +223,15 @@ list_one_attr(const char *name, const size_t len, void *data, } ssize_t -xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +xfs_vn_listxattr( + struct dentry *dentry, + char *data, + size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = d_inode(dentry); - int error; + struct inode *inode = d_inode(dentry); + int error; /* * First read the regular on-disk attributes. @@ -245,7 +249,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) else context.put_listent = xfs_xattr_put_listent_sizes; - xfs_attr_list_int(&context); + error = xfs_attr_list_int(&context); + if (error) + return error; if (context.count < 0) return -ERANGE; |