Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/dir.c9
-rw-r--r--fs/aio.c115
-rw-r--r--fs/autofs4/dev-ioctl.c8
-rw-r--r--fs/binfmt_elf.c5
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c17
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/disk-io.c17
-rw-r--r--fs/btrfs/extent-tree.c10
-rw-r--r--fs/btrfs/extent_io.c11
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file-item.c2
-rw-r--r--fs/btrfs/file.c91
-rw-r--r--fs/btrfs/inode.c4
-rw-r--r--fs/btrfs/ioctl.c29
-rw-r--r--fs/btrfs/relocation.c93
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/transaction.c12
-rw-r--r--fs/btrfs/tree-log.c13
-rw-r--r--fs/buffer.c42
-rw-r--r--fs/cachefiles/bind.c1
-rw-r--r--fs/cachefiles/namei.c7
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/ceph/mds_client.c21
-rw-r--r--fs/cifs/cifsglob.h2
-rw-r--r--fs/cifs/file.c9
-rw-r--r--fs/cifs/ioctl.c21
-rw-r--r--fs/cifs/link.c8
-rw-r--r--fs/cifs/readdir.c2
-rw-r--r--fs/cifs/smb1ops.c9
-rw-r--r--fs/cifs/smb2maperror.c4
-rw-r--r--fs/cifs/smb2ops.c12
-rw-r--r--fs/cifs/smb2pdu.c9
-rw-r--r--fs/compat.c21
-rw-r--r--fs/cramfs/inode.c3
-rw-r--r--fs/dcache.c55
-rw-r--r--fs/debugfs/inode.c34
-rw-r--r--fs/ecryptfs/crypto.c1
-rw-r--r--fs/ecryptfs/file.c18
-rw-r--r--fs/ecryptfs/inode.c4
-rw-r--r--fs/ecryptfs/main.c23
-rw-r--r--fs/eventpoll.c3
-rw-r--r--fs/exec.c5
-rw-r--r--fs/exportfs/expfs.c5
-rw-r--r--fs/ext3/super.c7
-rw-r--r--fs/ext4/bitmap.c12
-rw-r--r--fs/ext4/ext4.h13
-rw-r--r--fs/ext4/extents.c6
-rw-r--r--fs/ext4/file.c8
-rw-r--r--fs/ext4/ialloc.c7
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c39
-rw-r--r--fs/ext4/ioctl.c13
-rw-r--r--fs/ext4/mballoc.c14
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/namei.c55
-rw-r--r--fs/ext4/resize.c5
-rw-r--r--fs/ext4/super.c32
-rw-r--r--fs/ext4/xattr.c38
-rw-r--r--fs/f2fs/checkpoint.c1
-rw-r--r--fs/f2fs/node.c2
-rw-r--r--fs/fat/dir.c5
-rw-r--r--fs/fs-writeback.c29
-rw-r--r--fs/fuse/dev.c9
-rw-r--r--fs/fuse/dir.c197
-rw-r--r--fs/fuse/file.c105
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c30
-rw-r--r--fs/gfs2/aops.c1
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/inode.c5
-rw-r--r--fs/gfs2/meta_io.c9
-rw-r--r--fs/gfs2/meta_io.h3
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/hfsplus/brec.c20
-rw-r--r--fs/hppfs/hppfs.c5
-rw-r--r--fs/hugetlbfs/inode.c5
-rw-r--r--fs/internal.h7
-rw-r--r--fs/ioprio.c14
-rw-r--r--fs/isofs/rock.c9
-rw-r--r--fs/jbd2/recovery.c1
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/scan.c5
-rw-r--r--fs/jffs2/wbuf.c17
-rw-r--r--fs/lockd/mon.c6
-rw-r--r--fs/lockd/svc.c12
-rw-r--r--fs/locks.c24
-rw-r--r--fs/namei.c380
-rw-r--r--fs/namespace.c45
-rw-r--r--fs/ncpfs/ioctl.c1
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/callback.c8
-rw-r--r--fs/nfs/callback_xdr.c4
-rw-r--r--fs/nfs/delegation.c27
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/nfs4client.c57
-rw-r--r--fs/nfs/nfs4proc.c106
-rw-r--r--fs/nfs/nfs4renewd.c12
-rw-r--r--fs/nfs/nfs4state.c18
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4proc.c3
-rw-r--r--fs/nfsd/nfs4recover.c5
-rw-r--r--fs/nfsd/nfs4state.c15
-rw-r--r--fs/nfsd/nfs4xdr.c3
-rw-r--r--fs/nfsd/nfscache.c17
-rw-r--r--fs/nfsd/nfsd.h9
-rw-r--r--fs/nfsd/vfs.c14
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nilfs2/btree.c47
-rw-r--r--fs/nilfs2/inode.c39
-rw-r--r--fs/nilfs2/namei.c15
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/nilfs2/segment.c45
-rw-r--r--fs/nilfs2/segment.h5
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/fdinfo.c4
-rw-r--r--fs/notify/inode_mark.c17
-rw-r--r--fs/ntfs/attrib.c1
-rw-r--r--fs/ntfs/file.c1
-rw-r--r--fs/ocfs2/aops.c16
-rw-r--r--fs/ocfs2/dir.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c18
-rw-r--r--fs/ocfs2/file.c14
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/namei.c43
-rw-r--r--fs/open.c23
-rw-r--r--fs/overlayfs/Kconfig10
-rw-r--r--fs/overlayfs/Makefile7
-rw-r--r--fs/overlayfs/copy_up.c413
-rw-r--r--fs/overlayfs/dir.c928
-rw-r--r--fs/overlayfs/inode.c436
-rw-r--r--fs/overlayfs/overlayfs.h199
-rw-r--r--fs/overlayfs/readdir.c557
-rw-r--r--fs/overlayfs/super.c1046
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c53
-rw-r--r--fs/proc/generic.c12
-rw-r--r--fs/proc/inode.c21
-rw-r--r--fs/proc/internal.h8
-rw-r--r--fs/proc/namespaces.c14
-rw-r--r--fs/proc/proc_devtree.c241
-rw-r--r--fs/proc/root.c3
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/pstore/inode.c4
-rw-r--r--fs/pstore/ram.c28
-rw-r--r--fs/pstore/ram_core.c31
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/readdir.c21
-rw-r--r--fs/reiserfs/xattr.c15
-rw-r--r--fs/splice.c1
-rw-r--r--fs/super.c18
-rw-r--r--fs/ubifs/commit.c10
-rw-r--r--fs/ubifs/log.c19
-rw-r--r--fs/ubifs/master.c7
-rw-r--r--fs/ubifs/super.c1
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/udf/inode.c35
-rw-r--r--fs/udf/symlink.c17
-rw-r--r--fs/xfs/xfs_aops.c16
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_ioctl.c28
-rw-r--r--fs/xfs/xfs_mount.c1
-rw-r--r--fs/xfs/xfs_qm.c5
-rw-r--r--fs/xfs/xfs_sb.c24
-rw-r--r--fs/xfs/xfs_trans.c1
174 files changed, 5603 insertions, 1286 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 7385e54be4b9..6b40fd8d719e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
+source "fs/overlayfs/Kconfig"
menu "Caches"
diff --git a/fs/Makefile b/fs/Makefile
index 47ac07bb4acc..b5954404e02d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
+obj-$(CONFIG_OVERLAY_FS) += overlayfs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 529300327f45..5479af6de6ce 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -26,7 +26,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx);
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
static int afs_d_delete(const struct dentry *dentry);
static void afs_d_release(struct dentry *dentry);
-static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen,
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl);
@@ -391,10 +391,11 @@ static int afs_readdir(struct file *file, struct dir_context *ctx)
* - if afs_dir_iterate_block() spots this function, it'll pass the FID
* uniquifier through dtype
*/
-static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
- loff_t fpos, u64 ino, unsigned dtype)
+static int afs_lookup_filldir(struct dir_context *ctx, const char *name,
+ int nlen, loff_t fpos, u64 ino, unsigned dtype)
{
- struct afs_lookup_cookie *cookie = _cookie;
+ struct afs_lookup_cookie *cookie =
+ container_of(ctx, struct afs_lookup_cookie, ctx);
_enter("{%s,%u},%s,%u,,%llu,%u",
cookie->name.name, cookie->name.len, name, nlen,
diff --git a/fs/aio.c b/fs/aio.c
index 6d68e01dc7ca..3241659491b1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
struct {
unsigned tail;
+ unsigned completed_events;
spinlock_t completion_lock;
} ____cacheline_aligned_in_smp;
@@ -164,6 +165,15 @@ static struct vfsmount *aio_mnt;
static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;
+/* Backing dev info for aio fs.
+ * -no dirty page accounting or writeback happens
+ */
+static struct backing_dev_info aio_fs_backing_dev_info = {
+ .name = "aiofs",
+ .state = 0,
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
+};
+
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct qstr this = QSTR_INIT("[aio]", 5);
@@ -175,6 +185,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
inode->i_mapping->a_ops = &aio_ctx_aops;
inode->i_mapping->private_data = ctx;
+ inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
inode->i_size = PAGE_SIZE * nr_pages;
path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -220,6 +231,9 @@ static int __init aio_setup(void)
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
+ if (bdi_init(&aio_fs_backing_dev_info))
+ panic("Failed to init aio fs backing dev info.");
+
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
@@ -281,11 +295,6 @@ static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
};
-static int aio_set_page_dirty(struct page *page)
-{
- return 0;
-}
-
#if IS_ENABLED(CONFIG_MIGRATION)
static int aio_migratepage(struct address_space *mapping, struct page *new,
struct page *old, enum migrate_mode mode)
@@ -357,7 +366,7 @@ out:
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = aio_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
#if IS_ENABLED(CONFIG_MIGRATION)
.migratepage = aio_migratepage,
#endif
@@ -412,7 +421,6 @@ static int aio_setup_ring(struct kioctx *ctx)
pr_debug("pid(%d) page[%d]->count=%d\n",
current->pid, i, page_count(page));
SetPageUptodate(page);
- SetPageDirty(page);
unlock_page(page);
ctx->ring_pages[i] = page;
@@ -711,6 +719,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
err_cleanup:
aio_nr_sub(ctx->max_reqs);
err_ctx:
+ atomic_set(&ctx->dead, 1);
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
aio_free_ring(ctx);
err:
mutex_unlock(&ctx->ring_lock);
@@ -796,6 +807,9 @@ void exit_aio(struct mm_struct *mm)
unsigned i = 0;
while (1) {
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
+
rcu_read_lock();
table = rcu_dereference(mm->ioctx_table);
@@ -823,7 +837,10 @@ void exit_aio(struct mm_struct *mm)
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx, NULL);
+ kill_ioctx(mm, ctx, &requests_done);
+
+ /* Wait until all IO for the context are done. */
+ wait_for_completion(&requests_done);
}
}
@@ -880,6 +897,68 @@ out:
return ret;
}
+/* refill_reqs_available
+ * Updates the reqs_available reference counts used for tracking the
+ * number of free slots in the completion ring. This can be called
+ * from aio_complete() (to optimistically update reqs_available) or
+ * from aio_get_req() (the we're out of events case). It must be
+ * called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+ unsigned tail)
+{
+ unsigned events_in_ring, completed;
+
+ /* Clamp head since userland can write to it. */
+ head %= ctx->nr_events;
+ if (head <= tail)
+ events_in_ring = tail - head;
+ else
+ events_in_ring = ctx->nr_events - (head - tail);
+
+ completed = ctx->completed_events;
+ if (events_in_ring < completed)
+ completed -= events_in_ring;
+ else
+ completed = 0;
+
+ if (!completed)
+ return;
+
+ ctx->completed_events -= completed;
+ put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ * Called to refill reqs_available when aio_get_req() encounters an
+ * out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+ spin_lock_irq(&ctx->completion_lock);
+ if (ctx->completed_events) {
+ struct aio_ring *ring;
+ unsigned head;
+
+ /* Access of ring->head may race with aio_read_events_ring()
+ * here, but that's okay since whether we read the old version
+ * or the new version, and either will be valid. The important
+ * part is that head cannot pass tail since we prevent
+ * aio_complete() from updating tail by holding
+ * ctx->completion_lock. Even if head is invalid, the check
+ * against ctx->completed_events below will make sure we do the
+ * safe/right thing.
+ */
+ ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
+ kunmap_atomic(ring);
+
+ refill_reqs_available(ctx, head, ctx->tail);
+ }
+
+ spin_unlock_irq(&ctx->completion_lock);
+}
+
/* aio_get_req
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
@@ -888,8 +967,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
{
struct kiocb *req;
- if (!get_reqs_available(ctx))
- return NULL;
+ if (!get_reqs_available(ctx)) {
+ user_refill_reqs_available(ctx);
+ if (!get_reqs_available(ctx))
+ return NULL;
+ }
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
if (unlikely(!req))
@@ -948,8 +1030,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
+ unsigned tail, pos, head;
unsigned long flags;
- unsigned tail, pos;
/*
* Special case handling for sync iocbs:
@@ -1010,10 +1092,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
ctx->tail = tail;
ring = kmap_atomic(ctx->ring_pages[0]);
+ head = ring->head;
ring->tail = tail;
kunmap_atomic(ring);
flush_dcache_page(ctx->ring_pages[0]);
+ ctx->completed_events++;
+ if (ctx->completed_events > 1)
+ refill_reqs_available(ctx, head, tail);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1028,7 +1114,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
/* everything turned out well, dispose of the aiocb. */
kiocb_free(iocb);
- put_reqs_available(ctx, 1);
/*
* We have to order our ring_info tail store above and test
@@ -1065,6 +1150,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
tail = ring->tail;
kunmap_atomic(ring);
+ /*
+ * Ensure that once we've read the current tail pointer, that
+ * we also see the events that were stored up to the tail.
+ */
+ smp_rmb();
+
pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
if (head == tail)
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 3182c0e68b42..e3399dc2453b 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
*/
static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
- struct autofs_dev_ioctl tmp;
+ struct autofs_dev_ioctl tmp, *res;
if (copy_from_user(&tmp, in, sizeof(tmp)))
return ERR_PTR(-EFAULT);
@@ -103,7 +103,11 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size < sizeof(tmp))
return ERR_PTR(-EINVAL);
- return memdup_user(in, tmp.size);
+ res = memdup_user(in, tmp.size);
+ if (!IS_ERR(res))
+ res->size = tmp.size;
+
+ return res;
}
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 67be2951b98a..f4d7b2fc9ffb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -549,11 +549,12 @@ out:
static unsigned long randomize_stack_top(unsigned long stack_top)
{
- unsigned int random_variable = 0;
+ unsigned long random_variable = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = get_random_int() & STACK_RND_MASK;
+ random_variable = (unsigned long) get_random_int();
+ random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
#ifdef CONFIG_STACK_GROWSUP
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
- if (page) {
+ if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..93de3ba994e7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2655,32 +2655,23 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
return 0;
}
-int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
{
int ret;
struct btrfs_key key;
struct extent_buffer *eb;
- struct btrfs_path *path;
+
+ ASSERT(path);
key.type = key_type;
key.objectid = iobjectid;
key.offset = ioff;
- if (found_path == NULL) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- } else
- path = found_path;
-
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
- if ((ret < 0) || (found_key == NULL)) {
- if (path != found_path)
- btrfs_free_path(path);
+ if ((ret < 0) || (found_key == NULL))
return ret;
- }
eb = path->nodes[0];
if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..12e35566d2fc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1854,6 +1854,14 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
{
struct btrfs_delayed_node *delayed_node;
+ /*
+ * we don't do delayed inode updates during log recovery because it
+ * leads to enospc problems. This means we also can't do
+ * delayed inode refs
+ */
+ if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
+ return -EAGAIN;
+
delayed_node = btrfs_get_or_create_delayed_node(inode);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 370ef7450157..f48d5fc352a9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1560,6 +1560,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
bool check_ref)
{
struct btrfs_root *root;
+ struct btrfs_path *path;
int ret;
if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1599,8 +1600,14 @@ again:
if (ret)
goto fail;
- ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ ret = btrfs_find_item(fs_info->tree_root, path, BTRFS_ORPHAN_OBJECTID,
location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+ btrfs_free_path(path);
if (ret < 0)
goto fail;
if (ret == 0)
@@ -2411,7 +2418,7 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_ERR "BTRFS: has skinny extents\n");
+ printk(KERN_INFO "BTRFS: has skinny extents\n");
/*
* flag our filesystem as having big metadata blocks if
@@ -3978,12 +3985,6 @@ again:
if (ret)
break;
- /* opt_discard */
- if (btrfs_test_opt(root, DISCARD))
- ret = btrfs_error_discard_extent(root, start,
- end + 1 - start,
- NULL);
-
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
cond_resched();
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3ff98e23f651..d2f1c011d73a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5503,7 +5503,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
-static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
+ const bool return_free_space)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
@@ -5527,7 +5528,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
if (start < cache->last_byte_to_unpin) {
len = min(len, cache->last_byte_to_unpin - start);
- btrfs_add_free_space(cache, start, len);
+ if (return_free_space)
+ btrfs_add_free_space(cache, start, len);
}
start += len;
@@ -5590,7 +5592,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- unpin_extent_range(root, start, end);
+ unpin_extent_range(root, start, end, true);
cond_resched();
}
@@ -8886,7 +8888,7 @@ out:
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- return unpin_extent_range(root, start, end);
+ return unpin_extent_range(root, start, end, false);
}
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1a858947006e..fa9f90049099 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4507,7 +4507,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
{
unsigned long num_pages, i;
@@ -4516,7 +4517,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- mark_page_accessed(p);
+ if (p != accessed)
+ mark_page_accessed(p);
}
}
@@ -4530,7 +4532,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
- mark_extent_buffer_accessed(eb);
+ mark_extent_buffer_accessed(eb, NULL);
return eb;
}
rcu_read_unlock();
@@ -4578,7 +4580,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
spin_unlock(&mapping->private_lock);
unlock_page(p);
page_cache_release(p);
- mark_extent_buffer_accessed(exists);
+ mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
@@ -4593,7 +4595,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
WARN_ON(PageDirty(p));
- mark_page_accessed(p);
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..82845a6c63c2 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -290,8 +290,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
- list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ca248b0687f4..196b089b0052 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -423,7 +423,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
ret = 0;
fail:
while (ret < 0 && !list_empty(&tmplist)) {
- sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+ sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
list_del(&sums->list);
kfree(sums);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..0a841ddd6843 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
- *
- * Disable pagefault to avoid recursive lock since
- * the pages are already locked
*/
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
- pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -475,11 +470,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
- * clear it here
+ * clear it here. There should be no need to mark the pages
+ * accessed as prepare_pages should have marked them accessed
+ * in prepare_pages via find_or_create_page()
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- mark_page_accessed(pages[i]);
page_cache_release(pages[i]);
}
}
@@ -1778,22 +1774,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
mutex_unlock(&inode->i_mutex);
/*
- * we want to make sure fsync finds this change
- * but we haven't joined a transaction running right now.
- *
- * Later on, someone is sure to update the inode and get the
- * real transid recorded.
- *
- * We set last_trans now to the fs_info generation + 1,
- * this will either be one more than the running transaction
- * or the generation used for the next transaction if there isn't
- * one running right now.
- *
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occured.
*/
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
@@ -1896,25 +1880,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * check the transaction that last modified this inode
- * and see if its already been committed
- */
- if (!BTRFS_I(inode)->last_trans) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
-
- /*
- * if the last transaction that changed this file was before
- * the current transaction, we can bail out now without any
- * syncing
+ * If the last transaction that changed this file was before the current
+ * transaction and we have the full sync flag set in our inode, we can
+ * bail out now without any syncing.
+ *
+ * Note that we can't bail out if the full sync flag isn't set. This is
+ * because when the full sync flag is set we start all ordered extents
+ * and wait for them to fully complete - when they complete they update
+ * the inode's last_trans field through:
+ *
+ * btrfs_finish_ordered_io() ->
+ * btrfs_update_inode_fallback() ->
+ * btrfs_update_inode() ->
+ * btrfs_set_inode_last_trans()
+ *
+ * So we are sure that last_trans is up to date and can do this check to
+ * bail out safely. For the fast path, when the full sync flag is not
+ * set in our inode, we can not do it because we start only our ordered
+ * extents and don't wait for them to complete (that is when
+ * btrfs_finish_ordered_io runs), so here at this point their last_trans
+	 * value might be less than or equal to fs_info->last_trans_committed,
+ * and setting a speculative last_trans for an inode when a buffered
+ * write is made (such as fs_info->generation + 1 for example) would not
+ * be reliable since after setting the value and before fsync is called
+ * any number of transactions can start and commit (transaction kthread
+ * commits the current transaction periodically), and a transaction
+ * commit does not start nor waits for ordered extents to complete.
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed) {
- BTRFS_I(inode)->last_trans = 0;
-
+ (full_sync && BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed)) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2510,23 +2506,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- u64 lockstart = *offset;
- u64 lockend = i_size_read(inode);
- u64 start = *offset;
- u64 len = i_size_read(inode);
+ u64 lockstart;
+ u64 lockend;
+ u64 start;
+ u64 len;
int ret = 0;
- lockend = max_t(u64, root->sectorsize, lockend);
+ if (inode->i_size == 0)
+ return -ENXIO;
+
+ /*
+ * *offset can be negative, in this case we start finding DATA/HOLE from
+ * the very start of the file.
+ */
+ start = max_t(loff_t, 0, *offset);
+
+ lockstart = round_down(start, root->sectorsize);
+ lockend = round_up(i_size_read(inode), root->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
-
lockend--;
len = lockend - lockstart + 1;
- len = max_t(u64, len, root->sectorsize);
- if (inode->i_size == 0)
- return -ENXIO;
-
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
&cached_state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c69c76351f12..653cdd85e0f2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3596,7 +3596,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !root->fs_info->log_root_recovering) {
btrfs_update_root_times(trans, root);
ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -6869,7 +6870,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
int type;
- int ret;
u64 block_start, orig_start, orig_block_len, ram_bytes;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..96b95e6f9f91 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -302,6 +302,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
goto out_drop;
} else {
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
@@ -679,23 +682,6 @@ out:
return ret;
}
-/* copy of check_sticky in fs/namei.c()
-* It's inline, so penalty for filesystems that don't use sticky bit is
-* minimal.
-*/
-static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
-{
- kuid_t fsuid = current_fsuid();
-
- if (!(dir->i_mode & S_ISVTX))
- return 0;
- if (uid_eq(inode->i_uid, fsuid))
- return 0;
- if (uid_eq(dir->i_uid, fsuid))
- return 0;
- return !capable(CAP_FOWNER);
-}
-
/* copy of may_delete in fs/namei.c()
* Check whether we can remove a link victim from directory dir, check
* whether the type of victim is right.
@@ -731,8 +717,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (btrfs_check_sticky(dir, victim->d_inode)||
- IS_APPEND(victim->d_inode)||
+ if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
return -EPERM;
if (isdir) {
@@ -4750,6 +4735,12 @@ long btrfs_ioctl(struct file *file, unsigned int
if (ret)
return ret;
ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ /*
+ * The transaction thread may want to do more work,
+	 * namely it pokes the cleaner kthread that will start
+ * processing uncleaned subvols.
+ */
+ wake_up_process(root->fs_info->transaction_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..01f977e3ce09 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
err = ret;
goto out;
}
- BUG_ON(!ret || !path1->slots[0]);
+ ASSERT(ret);
+ ASSERT(path1->slots[0]);
path1->slots[0]--;
@@ -746,10 +747,10 @@ again:
* the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
- BUG_ON(!list_is_singular(&cur->upper));
+ ASSERT(list_is_singular(&cur->upper));
edge = list_entry(cur->upper.next, struct backref_edge,
list[LOWER]);
- BUG_ON(!list_empty(&edge->list[UPPER]));
+ ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
* add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
cur->cowonly = 1;
}
#else
- BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+ ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
#endif
if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
* backref of this type.
*/
root = find_reloc_root(rc, cur->bytenr);
- BUG_ON(!root);
+ ASSERT(root);
cur->root = root;
break;
}
@@ -868,7 +869,7 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
cur->bytenr);
if (should_ignore_root(root))
list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
need_check = true;
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
- BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+ ASSERT(btrfs_root_bytenr(&root->root_item) ==
lower->bytenr);
if (should_ignore_root(root))
list_add(&lower->list, &useless);
@@ -976,12 +977,15 @@ again:
need_check = false;
list_add_tail(&edge->list[UPPER],
&list);
- } else
+ } else {
+ if (upper->checked)
+ need_check = true;
INIT_LIST_HEAD(&edge->list[UPPER]);
+ }
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
- BUG_ON(!upper->checked);
+ ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
@@ -1025,7 +1029,7 @@ next:
* everything goes well, connect backref nodes and insert backref nodes
* into the cache.
*/
- BUG_ON(!node->checked);
+ ASSERT(node->checked);
cowonly = node->cowonly;
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1061,8 +1065,21 @@ next:
continue;
}
- BUG_ON(!upper->checked);
- BUG_ON(cowonly != upper->cowonly);
+ if (!upper->checked) {
+ /*
+ * Still want to blow up for developers since this is a
+ * logic bug.
+ */
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+ if (cowonly != upper->cowonly) {
+ ASSERT(0);
+ err = -EINVAL;
+ goto out;
+ }
+
if (!cowonly) {
rb_node = tree_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
@@ -1085,7 +1102,7 @@ next:
while (!list_empty(&useless)) {
upper = list_entry(useless.next, struct backref_node, list);
list_del_init(&upper->list);
- BUG_ON(!list_empty(&upper->upper));
+ ASSERT(list_empty(&upper->upper));
if (upper == node)
node = NULL;
if (upper->lowest) {
@@ -1118,29 +1135,45 @@ out:
if (err) {
while (!list_empty(&useless)) {
lower = list_entry(useless.next,
- struct backref_node, upper);
- list_del_init(&lower->upper);
+ struct backref_node, list);
+ list_del_init(&lower->list);
}
- upper = node;
- INIT_LIST_HEAD(&list);
- while (upper) {
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- list_splice_tail(&upper->upper, &list);
- free_backref_node(cache, upper);
- }
-
- if (list_empty(&list))
- break;
-
- edge = list_entry(list.next, struct backref_edge,
- list[LOWER]);
+ while (!list_empty(&list)) {
+ edge = list_first_entry(&list, struct backref_edge,
+ list[UPPER]);
+ list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
upper = edge->node[UPPER];
free_backref_edge(cache, edge);
+
+ /*
+ * Lower is no longer linked to any upper backref nodes
+ * and isn't in the cache, we can free it ourselves.
+ */
+ if (list_empty(&lower->upper) &&
+ RB_EMPTY_NODE(&lower->rb_node))
+ list_add(&lower->list, &useless);
+
+ if (!RB_EMPTY_NODE(&upper->rb_node))
+ continue;
+
+			/* Add this guy's upper edges to the list to process */
+ list_for_each_entry(edge, &upper->upper, list[LOWER])
+ list_add_tail(&edge->list[UPPER], &list);
+ if (list_empty(&upper->upper))
+ list_add(&upper->list, &useless);
+ }
+
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, list);
+ list_del_init(&lower->list);
+ free_backref_node(cache, lower);
}
return ERR_PTR(err);
}
- BUG_ON(node && node->detached);
+ ASSERT(!node || !node->detached);
return node;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a65ed4cb436b..20d793542096 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4728,7 +4728,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (S_ISREG(sctx->cur_inode_mode)) {
if (need_send_hole(sctx)) {
- if (sctx->cur_inode_last_extent == (u64)-1) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
ret = get_last_extent(sctx, (u64)-1);
if (ret)
goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b05bf58b9395..a0b65a01fed7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -592,7 +592,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
if (transid <= root->fs_info->last_trans_committed)
goto out;
- ret = -EINVAL;
/* find specified transaction */
spin_lock(&root->fs_info->trans_lock);
list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -608,9 +607,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
}
spin_unlock(&root->fs_info->trans_lock);
- /* The specified transaction doesn't exist */
- if (!cur_trans)
+
+ /*
+ * The specified transaction doesn't exist, or we
+ * raced with btrfs_commit_transaction
+ */
+ if (!cur_trans) {
+ if (transid > root->fs_info->last_trans_committed)
+ ret = -EINVAL;
goto out;
+ }
} else {
/* find newest transaction that is committing | committed */
spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..ec8b6542c8bd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -979,7 +979,7 @@ again:
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)base + cur_offset;
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
@@ -1235,13 +1235,14 @@ out:
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset)
+ struct btrfs_root *root, u64 ino)
{
int ret;
- ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
- offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
- if (ret > 0)
- ret = btrfs_insert_orphan_item(trans, root, offset);
+
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
+
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 27265a8b43c1..eef21c69f2d7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
- page = find_get_page(bd_mapping, index);
+ page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
@@ -1029,7 +1029,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
bh = page_buffers(page);
if (bh->b_size == size) {
end_block = init_page_buffers(page, bdev,
- index << sizebits, size);
+ (sector_t)index << sizebits,
+ size);
goto done;
}
if (!try_to_free_buffers(page))
@@ -1050,7 +1051,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
*/
spin_lock(&inode->i_mapping->private_lock);
link_dev_buffers(page, bh);
- end_block = init_page_buffers(page, bdev, index << sizebits, size);
+ end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
+ size);
spin_unlock(&inode->i_mapping->private_lock);
done:
ret = (block < end_block) ? 1 : -ENXIO;
@@ -1366,12 +1368,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
+ /* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block);
if (bh)
bh_lru_install(bh);
- }
- if (bh)
+ } else
touch_buffer(bh);
+
return bh;
}
EXPORT_SYMBOL(__find_get_block);
@@ -1483,16 +1486,27 @@ EXPORT_SYMBOL(set_bh_page);
/*
* Called when truncating a buffer on a page completely.
*/
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+ 1 << BH_Delay | 1 << BH_Unwritten)
+
static void discard_buffer(struct buffer_head * bh)
{
+ unsigned long b_state, b_state_old;
+
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
+ b_state = bh->b_state;
+ for (;;) {
+ b_state_old = cmpxchg(&bh->b_state, b_state,
+ (b_state & ~BUFFER_FLAGS_DISCARD));
+ if (b_state_old == b_state)
+ break;
+ b_state = b_state_old;
+ }
unlock_buffer(bh);
}
@@ -2075,6 +2089,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2094,6 +2109,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
* makes the holding time of page lock longer. Second, it forces lock
@@ -2311,6 +2328,11 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
err = 0;
balance_dirty_pages_ratelimited(mapping);
+
+ if (unlikely(fatal_signal_pending(current))) {
+ err = -EINTR;
+ goto out;
+ }
}
/* page covers the boundary, find the boundary offset */
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 622f4696e484..5b99bafc31d1 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
/* check parameters */
ret = -EOPNOTSUPP;
if (!root->d_inode ||
- !root->d_inode->i_op ||
!root->d_inode->i_op->lookup ||
!root->d_inode->i_op->mkdir ||
!root->d_inode->i_op->setxattr ||
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca65f39dc8dc..c0a681705104 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -391,12 +391,12 @@ try_again:
path.dentry = dir;
path_to_graveyard.mnt = cache->mnt;
path_to_graveyard.dentry = cache->graveyard;
- ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+ ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
ret = vfs_rename(dir->d_inode, rep,
- cache->graveyard->d_inode, grave, NULL);
+ cache->graveyard->d_inode, grave, NULL, 0);
if (ret != 0 && ret != -ENOMEM)
cachefiles_io_error(cache,
"Rename failed with error %d", ret);
@@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
}
ret = -EPERM;
- if (!subdir->d_inode->i_op ||
- !subdir->d_inode->i_op->setxattr ||
+ if (!subdir->d_inode->i_op->setxattr ||
!subdir->d_inode->i_op->getxattr ||
!subdir->d_inode->i_op->lookup ||
!subdir->d_inode->i_op->mkdir ||
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b53278c9fd97..94a85ee5b990 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -676,7 +676,7 @@ static int ceph_writepages_start(struct address_space *mapping,
int rc = 0;
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
- int do_sync;
+ int do_sync = 0;
u64 truncate_size, snap_size;
u32 truncate_seq;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f4f050a69a48..339c41216d14 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1461,15 +1461,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
dout("discard_cap_releases mds%d\n", session->s_mds);
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- session->s_num_cap_releases += num;
+ if (!list_empty(&session->s_cap_releases)) {
+ /* zero out the in-progress message */
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n",
+ session->s_mds, msg, num);
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ session->s_num_cap_releases += num;
+ }
/* requeue completed messages */
while (!list_empty(&session->s_cap_releases_done)) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f15d4353f30f..5d12d69e2045 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -399,6 +399,8 @@ struct smb_version_operations {
const struct cifs_fid *, u32 *);
int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
int);
+ /* check if we need to issue closedir */
+ bool (*dir_needs_close)(struct cifsFileInfo *);
};
struct smb_version_values {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8175b18df819..40ddb6e93912 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -366,6 +366,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
struct cifsLockInfo *li, *tmp;
struct cifs_fid fid;
struct cifs_pending_open open;
+ bool oplock_break_cancelled;
spin_lock(&cifs_file_list_lock);
if (--cifs_file->count > 0) {
@@ -397,7 +398,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
}
spin_unlock(&cifs_file_list_lock);
- cancel_work_sync(&cifs_file->oplock_break);
+ oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
struct TCP_Server_Info *server = tcon->ses->server;
@@ -409,6 +410,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
_free_xid(xid);
}
+ if (oplock_break_cancelled)
+ cifs_done_oplock_break(cifsi);
+
cifs_del_pending_open(&open);
/*
@@ -762,7 +766,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
cifs_dbg(FYI, "Freeing private data in close dir\n");
spin_lock(&cifs_file_list_lock);
- if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
+ if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
spin_unlock(&cifs_file_list_lock);
if (server->ops->close_dir)
@@ -1817,6 +1821,7 @@ refind_writable:
cifsFileInfo_put(inv_file);
spin_lock(&cifs_file_list_lock);
++refind;
+ inv_file = NULL;
goto refind_writable;
}
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 77492301cc2b..dfc95646b88c 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -86,21 +86,16 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
}
src_inode = src_file.file->f_dentry->d_inode;
+ rc = -EINVAL;
+ if (S_ISDIR(src_inode->i_mode))
+ goto out_fput;
/*
* Note: cifs case is easier than btrfs since server responsible for
* checks for proper open modes and file type and if it wants
* server could even support copy of range where source = target
*/
-
- /* so we do not deadlock racing two ioctls on same files */
- if (target_inode < src_inode) {
- mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
- }
+ lock_two_nondirectories(target_inode, src_inode);
/* determine range to clone */
rc = -EINVAL;
@@ -124,13 +119,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
out_unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
- if (target_inode < src_inode) {
- mutex_unlock(&src_inode->i_mutex);
- mutex_unlock(&target_inode->i_mutex);
- } else {
- mutex_unlock(&target_inode->i_mutex);
- mutex_unlock(&src_inode->i_mutex);
- }
+ unlock_two_nondirectories(src_inode, target_inode);
out_fput:
fdput(src_file);
out_drop_write:
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..a5c2812ead68 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto out;
- rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
- fromName, buf, &bytes_written);
+ if (tcon->ses->server->ops->create_mf_symlink)
+ rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
+ cifs_sb, fromName, buf, &bytes_written);
+ else
+ rc = -EOPNOTSUPP;
+
if (rc)
goto out;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2bbf11b09214..b334a89d6a66 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
/* close and restart search */
cifs_dbg(FYI, "search backing up - close and restart search\n");
spin_lock(&cifs_file_list_lock);
- if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
+ if (server->ops->dir_needs_close(cfile)) {
cfile->invalidHandle = true;
spin_unlock(&cifs_file_list_lock);
if (server->ops->close_dir)
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d1fdfa848703..e9ad8d37bb00 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
if (tmprc == -EOPNOTSUPP)
*symlink = true;
- else
+ else if (tmprc == 0)
CIFSSMBClose(xid, tcon, fid.netfid);
}
@@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock)
return oplock == OPLOCK_READ;
}
+static bool
+cifs_dir_needs_close(struct cifsFileInfo *cfile)
+{
+ return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1078,6 +1084,7 @@ struct smb_version_operations smb1_operations = {
.query_mf_symlink = cifs_query_mf_symlink,
.create_mf_symlink = cifs_create_mf_symlink,
.is_read_op = cifs_is_read_op,
+ .dir_needs_close = cifs_dir_needs_close,
#ifdef CONFIG_CIFS_XATTR
.query_all_EAs = CIFSSMBQAllEAs,
.set_EA = CIFSSMBSetEA,
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index e31a9dfdcd39..a491814cb2c0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
{STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
{STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
- {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"},
+ {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
{STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
{STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
{STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
"STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
{STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
+ {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
+ "STATUS_REPARSE_NOT_HANDLED"},
{STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
"STATUS_DEVICE_REQUIRES_CLEANING"},
{STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f8977b2d9187..30f3eb5bc022 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -630,7 +630,8 @@ smb2_clone_range(const unsigned int xid,
/* No need to change MaxChunks since already set to 1 */
chunk_sizes_updated = true;
- }
+ } else
+ goto cchunk_out;
}
cchunk_out:
@@ -1102,6 +1103,12 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch)
return le32_to_cpu(lc->lcontext.LeaseState);
}
+static bool
+smb2_dir_needs_close(struct cifsFileInfo *cfile)
+{
+ return !cfile->invalidHandle;
+}
+
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -1175,6 +1182,7 @@ struct smb_version_operations smb20_operations = {
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .dir_needs_close = smb2_dir_needs_close,
};
struct smb_version_operations smb21_operations = {
@@ -1250,6 +1258,7 @@ struct smb_version_operations smb21_operations = {
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .dir_needs_close = smb2_dir_needs_close,
};
struct smb_version_operations smb30_operations = {
@@ -1328,6 +1337,7 @@ struct smb_version_operations smb30_operations = {
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
.validate_negotiate = smb3_validate_negotiate,
+ .dir_needs_close = smb2_dir_needs_close,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 9aab8fe0e508..348792911e1f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2136,6 +2136,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
if (rc) {
+ if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
+ srch_inf->endOfSearch = true;
+ rc = 0;
+ }
cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
goto qdir_exit;
}
@@ -2173,11 +2177,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_dbg(VFS, "illegal search buffer type\n");
- if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
- srch_inf->endOfSearch = 1;
- else
- srch_inf->endOfSearch = 0;
-
return rc;
qdir_exit:
diff --git a/fs/compat.c b/fs/compat.c
index 6af20de2c1a3..14da9b327ffa 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -837,10 +837,12 @@ struct compat_readdir_callback {
int result;
};
-static int compat_fillonedir(void *__buf, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int compat_fillonedir(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
- struct compat_readdir_callback *buf = __buf;
+ struct compat_readdir_callback *buf =
+ container_of(ctx, struct compat_readdir_callback, ctx);
struct compat_old_linux_dirent __user *dirent;
compat_ulong_t d_ino;
@@ -905,11 +907,12 @@ struct compat_getdents_callback {
int error;
};
-static int compat_filldir(void *__buf, const char *name, int namlen,
+static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct compat_linux_dirent __user * dirent;
- struct compat_getdents_callback *buf = __buf;
+ struct compat_getdents_callback *buf =
+ container_of(ctx, struct compat_getdents_callback, ctx);
compat_ulong_t d_ino;
int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
namlen + 2, sizeof(compat_long_t));
@@ -991,11 +994,13 @@ struct compat_getdents_callback64 {
int error;
};
-static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int compat_filldir64(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
struct linux_dirent64 __user *dirent;
- struct compat_getdents_callback64 *buf = __buf;
+ struct compat_getdents_callback64 *buf =
+ container_of(ctx, struct compat_getdents_callback64, ctx);
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
u64 off;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 06610cf94d57..a1f801c14fbc 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
struct page *page = NULL;
if (blocknr + i < devsize) {
- page = read_mapping_page_async(mapping, blocknr + i,
- NULL);
+ page = read_mapping_page(mapping, blocknr + i, NULL);
/* synchronous error? */
if (IS_ERR(page))
page = NULL;
diff --git a/fs/dcache.c b/fs/dcache.c
index 58d57da91d2a..e3c556351810 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2481,12 +2481,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
dentry->d_name.name = dentry->d_iname;
} else {
/*
- * Both are internal. Just copy target to dentry
+ * Both are internal.
*/
- memcpy(dentry->d_iname, target->d_name.name,
- target->d_name.len + 1);
- dentry->d_name.len = target->d_name.len;
- return;
+ unsigned int i;
+ BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+ for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
+ swap(((long *) &dentry->d_iname)[i],
+ ((long *) &target->d_iname)[i]);
+ }
}
}
swap(dentry->d_name.len, target->d_name.len);
@@ -2543,13 +2545,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
* __d_move - move a dentry
* @dentry: entry to move
* @target: new dentry
+ * @exchange: exchange the two dentries
*
* Update the dcache to reflect the move of a file name. Negative
* dcache entries should not be moved in this way. Caller must hold
* rename_lock, the i_mutex of the source and target directories,
* and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
*/
-static void __d_move(struct dentry * dentry, struct dentry * target)
+static void __d_move(struct dentry *dentry, struct dentry *target,
+ bool exchange)
{
if (!dentry->d_inode)
printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2571,8 +2575,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
__d_drop(dentry);
__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
- /* Unhash the target: dput() will then get rid of it */
+ /*
+ * Unhash the target (d_delete() is not usable here). If exchanging
+ * the two dentries, then rehash onto the other's hash queue.
+ */
__d_drop(target);
+ if (exchange) {
+ __d_rehash(target,
+ d_hash(dentry->d_parent, dentry->d_name.hash));
+ }
list_del(&dentry->d_u.d_child);
list_del(&target->d_u.d_child);
@@ -2599,6 +2610,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
write_seqcount_end(&dentry->d_seq);
dentry_unlock_parents_for_move(dentry, target);
+ if (exchange)
+ fsnotify_d_move(target);
spin_unlock(&target->d_lock);
fsnotify_d_move(dentry);
spin_unlock(&dentry->d_lock);
@@ -2616,11 +2629,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
void d_move(struct dentry *dentry, struct dentry *target)
{
write_seqlock(&rename_lock);
- __d_move(dentry, target);
+ __d_move(dentry, target, false);
write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);
+/*
+ * d_exchange - exchange two dentries
+ * @dentry1: first dentry
+ * @dentry2: second dentry
+ */
+void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
+{
+ write_seqlock(&rename_lock);
+
+ WARN_ON(!dentry1->d_inode);
+ WARN_ON(!dentry2->d_inode);
+ WARN_ON(IS_ROOT(dentry1));
+ WARN_ON(IS_ROOT(dentry2));
+
+ __d_move(dentry1, dentry2, true);
+
+ write_sequnlock(&rename_lock);
+}
+
/**
* d_ancestor - search for an ancestor
* @p1: ancestor dentry
@@ -2668,7 +2700,7 @@ static struct dentry *__d_unalias(struct inode *inode,
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
if (likely(!d_mountpoint(alias))) {
- __d_move(alias, dentry);
+ __d_move(alias, dentry, false);
ret = alias;
}
out_err:
@@ -2824,6 +2856,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
* the beginning of the name. The sequence number check at the caller will
* retry it again when a d_move() does happen. So any garbage in the buffer
* due to mismatched pointer and length will be discarded.
+ *
+ * Data dependency barrier is needed to make sure that we see that terminating
+ * NUL. Alpha strikes again, film at 11...
*/
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
{
@@ -2831,6 +2866,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
u32 dlen = ACCESS_ONCE(name->len);
char *p;
+ smp_read_barrier_depends();
+
*buflen -= dlen + 1;
if (*buflen < 0)
return -ENAMETOOLONG;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 15761957cc3f..1ff8fe5dab0d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -245,10 +245,19 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void debugfs_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_private);
+}
+
static const struct super_operations debugfs_super_operations = {
.statfs = simple_statfs,
.remount_fs = debugfs_remount,
.show_options = debugfs_show_options,
+ .evict_inode = debugfs_evict_inode,
};
static int debug_fill_super(struct super_block *sb, void *data, int silent)
@@ -465,23 +474,14 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
int ret = 0;
if (debugfs_positive(dentry)) {
- if (dentry->d_inode) {
- dget(dentry);
- switch (dentry->d_inode->i_mode & S_IFMT) {
- case S_IFDIR:
- ret = simple_rmdir(parent->d_inode, dentry);
- break;
- case S_IFLNK:
- kfree(dentry->d_inode->i_private);
- /* fall through */
- default:
- simple_unlink(parent->d_inode, dentry);
- break;
- }
- if (!ret)
- d_delete(dentry);
- dput(dentry);
- }
+ dget(dentry);
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ ret = simple_rmdir(parent->d_inode, dentry);
+ else
+ simple_unlink(parent->d_inode, dentry);
+ if (!ret)
+ d_delete(dentry);
+ dput(dentry);
}
return ret;
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2f6735dbf1a9..31b148f3e772 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1917,7 +1917,6 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
break;
case 2:
dst[dst_byte_offset++] |= (src_byte);
- dst[dst_byte_offset] = 0;
current_bit_offset = 0;
break;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index b1eaa7a1f82c..121a94876d40 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -76,11 +76,11 @@ struct ecryptfs_getdents_callback {
/* Inspired by generic filldir in fs/readdir.c */
static int
-ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
- loff_t offset, u64 ino, unsigned int d_type)
+ecryptfs_filldir(struct dir_context *ctx, const char *lower_name,
+ int lower_namelen, loff_t offset, u64 ino, unsigned int d_type)
{
struct ecryptfs_getdents_callback *buf =
- (struct ecryptfs_getdents_callback *)dirent;
+ container_of(ctx, struct ecryptfs_getdents_callback, ctx);
size_t name_size;
char *name;
int rc;
@@ -191,23 +191,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
{
int rc = 0;
struct ecryptfs_crypt_stat *crypt_stat = NULL;
- struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct dentry *ecryptfs_dentry = file->f_path.dentry;
/* Private value of ecryptfs_dentry allocated in
* ecryptfs_lookup() */
struct ecryptfs_file_info *file_info;
- mount_crypt_stat = &ecryptfs_superblock_to_private(
- ecryptfs_dentry->d_sb)->mount_crypt_stat;
- if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
- && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR)
- || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC)
- || (file->f_flags & O_APPEND))) {
- printk(KERN_WARNING "Mount has encrypted view enabled; "
- "files may only be read\n");
- rc = -EPERM;
- goto out;
- }
/* Released in ecryptfs_release or end of function if failure */
file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL);
ecryptfs_set_file_private(file, file_info);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b167ca48b8ee..57ee4c53b4f8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
lower_new_dir_dentry->d_inode, lower_new_dentry,
- NULL);
+ NULL, 0);
if (rc)
goto out_lock;
if (target_inode)
@@ -1039,7 +1039,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
- if (!rc)
+ if (!rc && dentry->d_inode)
fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
out:
return rc;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1b119d3bf924..d9eb84bda559 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -493,6 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
{
struct super_block *s;
struct ecryptfs_sb_info *sbi;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct ecryptfs_dentry_info *root_info;
const char *err = "Getting sb failed";
struct inode *inode;
@@ -511,6 +512,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
err = "Error parsing options";
goto out;
}
+ mount_crypt_stat = &sbi->mount_crypt_stat;
s = sget(fs_type, NULL, set_anon_super, flags, NULL);
if (IS_ERR(s)) {
@@ -557,15 +559,30 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
/**
* Set the POSIX ACL flag based on whether they're enabled in the lower
- * mount. Force a read-only eCryptfs mount if the lower mount is ro.
- * Allow a ro eCryptfs mount even when the lower mount is rw.
+ * mount.
*/
s->s_flags = flags & ~MS_POSIXACL;
- s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL);
+ s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL;
+
+ /**
+ * Force a read-only eCryptfs mount when:
+ * 1) The lower mount is ro
+ * 2) The ecryptfs_encrypted_view mount option is specified
+ */
+ if (path.dentry->d_sb->s_flags & MS_RDONLY ||
+ mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+ s->s_flags |= MS_RDONLY;
s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
s->s_blocksize = path.dentry->d_sb->s_blocksize;
s->s_magic = ECRYPTFS_SUPER_MAGIC;
+ s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+
+ rc = -EINVAL;
+ if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
+ goto out_free;
+ }
inode = ecryptfs_get_inode(path.dentry->d_inode, s);
rc = PTR_ERR(inode);
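
The eCryptfs mount path above now propagates and bounds the stacking depth of the new superblock. A sketch of that check in isolation, with a hypothetical myfs_check_stack_depth() helper; FILESYSTEM_MAX_STACK_DEPTH and s_stack_depth are the fields the hunk itself uses:

static int myfs_check_stack_depth(struct super_block *sb,
                                  struct super_block *lower_sb)
{
        /* One level deeper than whatever we are stacked on. */
        sb->s_stack_depth = lower_sb->s_stack_depth + 1;

        if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
                pr_err("myfs: maximum fs stacking depth exceeded\n");
                return -EINVAL;
        }
        return 0;
}
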
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ead00467282d..f50d79eb52db 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
- ep_take_care_of_epollwakeup(&epds);
+ if (ep_op_has_event(op))
+ ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
diff --git a/fs/exec.c b/fs/exec.c
index 31e46b1b358b..ea4449d0536a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -820,7 +821,7 @@ EXPORT_SYMBOL(read_code);
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
- struct mm_struct * old_mm, *active_mm;
+ struct mm_struct *old_mm, *active_mm;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
@@ -846,6 +847,8 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
+ tsk->mm->vmacache_seqnum = 0;
+ vmacache_flush(tsk);
task_unlock(tsk);
if (old_mm) {
up_read(&old_mm->mmap_sem);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 48a359dd286e..59d339c417c9 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -241,10 +241,11 @@ struct getdents_callback {
* A rather strange filldir function to capture
* the name matching the specified inode number.
*/
-static int filldir_one(void * __buf, const char * name, int len,
+static int filldir_one(struct dir_context *ctx, const char *name, int len,
loff_t pos, u64 ino, unsigned int d_type)
{
- struct getdents_callback *buf = __buf;
+ struct getdents_callback *buf =
+ container_of(ctx, struct getdents_callback, ctx);
int result = 0;
buf->sequence++;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31ed16e7..0498390f309e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1354,13 +1354,6 @@ set_qf_format:
"not specified.");
return 0;
}
- } else {
- if (sbi->s_jquota_fmt) {
- ext3_msg(sb, KERN_ERR, "error: journaled quota format "
- "specified with no journaling "
- "enabled.");
- return 0;
- }
}
#endif
return 1;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 3285aa5a706a..b610779a958c 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
@@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
@@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 62f024c051ce..2a6830a7af33 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2110,6 +2110,7 @@ int do_journal_get_write_access(handle_t *handle,
#define CONVERT_INLINE_DATA 2
extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -2340,10 +2341,18 @@ extern int ext4_register_li_request(struct super_block *sb,
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
return EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
+ (EXT4_SB(sb)->s_chksum_driver != NULL);
}
+static inline int ext4_has_metadata_csum(struct super_block *sb)
+{
+ WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ !EXT4_SB(sb)->s_chksum_driver);
+
+ return (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47188916dd8d..96a1ce159f51 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -74,8 +74,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@@ -89,8 +88,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a8b2e15dcc4..589117e9e9be 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -100,7 +100,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
struct blk_plug plug;
int unaligned_aio = 0;
ssize_t ret;
- int overwrite = 0;
+ int *overwrite = iocb->private;
size_t length = iov_length(iov, nr_segs);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
@@ -118,8 +118,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
mutex_lock(&inode->i_mutex);
blk_start_plug(&plug);
- iocb->private = &overwrite;
-
/* check whether we do a DIO overwrite or not */
if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
!file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
@@ -143,7 +141,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
* So we should check these two conditions.
*/
if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
- overwrite = 1;
+ *overwrite = 1;
}
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
@@ -170,6 +168,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ int overwrite = 0;
/*
* If we have encountered a bitmap-format file, the size limit
@@ -190,6 +189,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
+ iocb->private = &overwrite;
if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
else
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 64bb32f17903..a8d1a64d8cb0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -864,6 +864,10 @@ got:
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (!block_bitmap_bh) {
+ err = -EIO;
+ goto out;
+ }
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
@@ -988,8 +992,7 @@ got:
spin_unlock(&sbi->s_next_gen_lock);
/* Precompute checksum seed for inode metadata */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 82edf5b93352..8c03b747021b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1128,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
inline_size - EXT4_INLINE_DOTDOT_SIZE);
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
inode->i_size = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b56062dc8b62..3a7e0341447f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ !ext4_has_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ !ext4_has_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -2633,6 +2631,20 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
+/* We always reserve for an inode update; the superblock could be there too */
+static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
+{
+ if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+ return 1;
+
+ if (pos + len <= 0x7fffffffULL)
+ return 1;
+
+ /* We might need to update the superblock to set LARGE_FILE */
+ return 2;
+}
+
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -2683,7 +2695,8 @@ retry_grab:
* of file which has an already mapped buffer.
*/
retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+ ext4_da_write_credits(inode, pos, len));
if (IS_ERR(handle)) {
page_cache_release(page);
return PTR_ERR(handle);
@@ -4061,8 +4074,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
@@ -4250,6 +4262,13 @@ bad_inode:
return ERR_PTR(ret);
}
+struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
+{
+ if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+ return ERR_PTR(-EIO);
+ return ext4_iget(sb, ino);
+}
+
static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
@@ -4645,8 +4664,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_orphan_del(NULL, inode);
goto err_out;
}
- } else
+ } else {
+ loff_t oldsize = inode->i_size;
+
i_size_write(inode, attr->ia_size);
+ pagecache_isize_extended(inode, oldsize, inode->i_size);
+ }
/*
* Blocks are going to be removed from the inode. Wait
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2a837f00407..dfe982dee0b3 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -343,8 +343,7 @@ flags_out:
if (!inode_owner_or_capable(inode))
return -EPERM;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@@ -544,9 +543,17 @@ group_add_out:
}
case EXT4_IOC_SWAP_BOOT:
+ {
+ int err;
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- return swap_inode_boot_loader(sb, inode);
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ err = swap_inode_boot_loader(sb, inode);
+ mnt_drop_write_file(filp);
+ return err;
+ }
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
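
The EXT4_IOC_SWAP_BOOT hunk above brackets the operation with mnt_want_write_file()/mnt_drop_write_file(). A sketch of that bracketing for a hypothetical modifying ioctl handler; do_the_modification() is a stand-in for whatever actually writes to the filesystem:

static long my_modifying_ioctl(struct file *filp)
{
        int err;

        if (!(filp->f_mode & FMODE_WRITE))
                return -EBADF;

        err = mnt_want_write_file(filp);        /* fails on ro, waits out a freeze */
        if (err)
                return err;

        err = do_the_modification(filp);        /* hypothetical helper */

        mnt_drop_write_file(filp);
        return err;
}
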
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 242226a87be7..7620133f78bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
* would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
if (e4b.bd_buddy_page == NULL) {
/*
@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
ext4_mb_put_buddy_page_lock(&e4b);
return ret;
@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 04434ad3e8e0..1268a1b5afa9 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,8 +28,7 @@ int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d050e043e884..98de4b391e79 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -123,8 +123,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
"directory leaf block found instead of index block");
return ERR_PTR(-EIO);
}
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+ if (!ext4_has_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@@ -339,8 +338,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
{
struct ext4_dir_entry_tail *t;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, dirent);
@@ -361,8 +359,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, dirent);
@@ -437,8 +434,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -467,8 +463,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -556,8 +551,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -566,8 +560,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -1429,7 +1422,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
dentry);
return ERR_PTR(-EIO);
}
- inode = ext4_iget(dir->i_sb, ino);
+ inode = ext4_iget_normal(dir->i_sb, ino);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
@@ -1460,7 +1453,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
return ERR_PTR(-EIO);
}
- return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino));
}
/*
@@ -1534,8 +1527,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
@@ -1704,8 +1696,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
int csum_size = 0;
int err;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
@@ -1772,8 +1763,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct fake_dirent *fde;
int csum_size = 0;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
@@ -1889,8 +1879,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
@@ -2152,8 +2141,7 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
@@ -2372,8 +2360,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err;
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -3204,6 +3191,16 @@ end_rename:
return retval;
}
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
/*
* directories can handle most operations...
*/
@@ -3217,7 +3214,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
- .rename = ext4_rename,
+ .rename2 = ext4_rename2,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
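
ext4_rename2() above is the minimal ->rename2() wrapper: reject any flag the filesystem does not implement and fall through to the existing rename path. The same shape, sketched with hypothetical myfs_* names:

static int myfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry,
                        unsigned int flags)
{
        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        /*
         * RENAME_NOREPLACE needs no extra work here: the VFS only calls
         * in after verifying that new_dentry is negative.
         */
        return myfs_rename(old_dir, old_dentry, new_dir, new_dentry);
}
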
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f3b84cd9de56..2400ad1c3d12 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1071,7 +1071,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
break;
if (meta_bg == 0)
- backup_block = group * bpg + blk_off;
+ backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
backup_block = (ext4_group_first_block_no(sb, group) +
ext4_bg_has_super(sb, group));
@@ -1200,8 +1200,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
{
struct buffer_head *bh;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a46030d6b4af..9fb3e6c0c578 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -140,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return 1;
return es->s_checksum == ext4_superblock_csum(sb, es);
@@ -151,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(sb))
return;
es->s_checksum = ext4_superblock_csum(sb, es);
@@ -996,7 +994,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
* Currently we don't know the generation for parent directory, so
* a generation of 0 means "accept any"
*/
- inode = ext4_iget(sb, ino);
+ inode = ext4_iget_normal(sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
@@ -1706,13 +1704,6 @@ static int parse_options(char *options, struct super_block *sb,
"not specified");
return 0;
}
- } else {
- if (sbi->s_jquota_fmt) {
- ext4_msg(sb, KERN_ERR, "journaled quota format "
- "specified with no journaling "
- "enabled");
- return 0;
- }
}
#endif
if (test_opt(sb, DIOREAD_NOLOCK)) {
@@ -2010,8 +2001,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
- if ((sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+ if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__le16 save_csum;
__u32 csum32;
@@ -2029,6 +2019,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
}
/* old crc16 code */
+ if (!(sbi->s_es->s_feature_ro_compat &
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+ return 0;
+
offset = offsetof(struct ext4_group_desc, bg_checksum);
crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
@@ -3167,8 +3161,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_metadata_csum(sb)) {
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -3475,8 +3468,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Precompute checksum seed for all metadata */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (ext4_has_metadata_csum(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
@@ -3494,6 +3486,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ /* don't forget to enable journal_csum when metadata_csum is enabled. */
+ if (ext4_has_metadata_csum(sb))
+ set_opt(sb, JOURNAL_CHECKSUM);
+
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 55e611c1513c..8825154b20b6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -141,8 +141,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
sector_t block_nr,
struct ext4_xattr_header *hdr)
{
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ if (ext4_has_metadata_csum(inode->i_sb) &&
(hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
return 0;
return 1;
@@ -152,8 +151,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode,
sector_t block_nr,
struct ext4_xattr_header *hdr)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_metadata_csum(inode->i_sb))
return;
hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
@@ -189,14 +187,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
static int
-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
+ void *value_start)
{
- while (!IS_LAST_ENTRY(entry)) {
- struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
+ struct ext4_xattr_entry *e = entry;
+
+ while (!IS_LAST_ENTRY(e)) {
+ struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
return -EIO;
- entry = next;
+ e = next;
}
+
+ while (!IS_LAST_ENTRY(entry)) {
+ if (entry->e_value_size != 0 &&
+ (value_start + le16_to_cpu(entry->e_value_offs) <
+ (void *)e + sizeof(__u32) ||
+ value_start + le16_to_cpu(entry->e_value_offs) +
+ le32_to_cpu(entry->e_value_size) > end))
+ return -EIO;
+ entry = EXT4_XATTR_NEXT(entry);
+ }
+
return 0;
}
@@ -213,7 +225,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
return -EIO;
if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
return -EIO;
- error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
+ bh->b_data);
if (!error)
set_buffer_verified(bh);
return error;
@@ -329,7 +342,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(entry, end);
+ error = ext4_xattr_check_names(entry, end, entry);
if (error)
goto cleanup;
error = ext4_xattr_find_entry(&entry, name_index, name,
@@ -457,7 +470,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(IFIRST(header), end);
+ error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
if (error)
goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
@@ -972,7 +985,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+ error = ext4_xattr_check_names(IFIRST(header), is->s.end,
+ IFIRST(header));
if (error)
return error;
/* Find the named attribute. */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 293d0486a40f..5c6fe278fb63 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -71,7 +71,6 @@ repeat:
goto repeat;
}
out:
- mark_page_accessed(page);
return page;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b0649b76eb4f..bb6478acb369 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -969,7 +969,6 @@ repeat:
}
got_it:
f2fs_bug_on(nid != nid_of_node(page));
- mark_page_accessed(page);
return page;
}
@@ -1024,7 +1023,6 @@ page_hit:
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
- mark_page_accessed(page);
return page;
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3963ede84eb0..c5d6bb939d19 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -702,10 +702,11 @@ static int fat_readdir(struct file *file, struct dir_context *ctx)
}
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
-static int func(void *__buf, const char *name, int name_len, \
+static int func(struct dir_context *ctx, const char *name, int name_len, \
loff_t offset, u64 ino, unsigned int d_type) \
{ \
- struct fat_ioctl_filldir_callback *buf = __buf; \
+ struct fat_ioctl_filldir_callback *buf = \
+ container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \
struct dirent_type __user *d1 = buf->dirent; \
struct dirent_type __user *d2 = d1 + 1; \
\
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a16315957ef3..23a51f08e3b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -476,12 +476,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* write_inode()
*/
spin_lock(&inode->i_lock);
- /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
- if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- inode->i_state &= ~I_DIRTY_PAGES;
+
dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ inode->i_state &= ~I_DIRTY;
+
+ /*
+ * Paired with smp_mb() in __mark_inode_dirty(). This allows
+ * __mark_inode_dirty() to test i_state without grabbing i_lock -
+ * either they see the I_DIRTY bits cleared or we see the dirtied
+ * inode.
+ *
+ * I_DIRTY_PAGES is always cleared together above even if @mapping
+ * still has dirty pages. The flag is reinstated after smp_mb() if
+ * necessary. This guarantees that either __mark_inode_dirty()
+ * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
+ */
+ smp_mb();
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ inode->i_state |= I_DIRTY_PAGES;
+
spin_unlock(&inode->i_lock);
+
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
@@ -1145,12 +1161,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
/*
- * make sure that changes are seen by all cpus before we test i_state
- * -- mikulas
+ * Paired with smp_mb() in __writeback_single_inode() for the
+ * following lockless i_state test. See there for details.
*/
smp_mb();
- /* avoid the locking if we can */
if ((inode->i_state & flags) == flags)
return;
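
The two fs-writeback hunks above pair an smp_mb() on each side of the lockless I_DIRTY handshake: writeback clears I_DIRTY and then re-tests the dirty tag, while __mark_inode_dirty() relies on the tag already being visible before it tests i_state. A userspace C11 sketch of why that pairing suffices: at least one side is guaranteed to observe the other's write, so a freshly dirtied page can never be lost. Names are stand-ins; run marker() and writeback() from separate threads to exercise the race for real.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int page_tag_dirty;               /* stands in for PAGECACHE_TAG_DIRTY */
static atomic_int inode_dirty = 1;              /* stands in for I_DIRTY in i_state */

/* __mark_inode_dirty() side: the tag is set first, then i_state is tested. */
static void marker(void)
{
        atomic_store_explicit(&page_tag_dirty, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        if (!atomic_load_explicit(&inode_dirty, memory_order_relaxed))
                printf("marker: I_DIRTY was cleared, redirty the inode\n");
}

/* __writeback_single_inode() side: clear I_DIRTY, then re-test the tag. */
static void writeback(void)
{
        atomic_store_explicit(&inode_dirty, 0, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() */
        if (atomic_load_explicit(&page_tag_dirty, memory_order_relaxed))
                printf("writeback: pages still dirty, reinstate I_DIRTY_PAGES\n");
}

int main(void)
{
        marker();
        writeback();
        return 0;
}
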
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0a648bb455ae..499155ca3e84 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -819,8 +819,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
newpage = buf->page;
- if (WARN_ON(!PageUptodate(newpage)))
- return -EIO;
+ if (!PageUptodate(newpage))
+ SetPageUptodate(newpage);
ClearPageMappedToDisk(newpage);
@@ -1614,7 +1614,7 @@ out_finish:
static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
{
- release_pages(req->pages, req->num_pages, 0);
+ release_pages(req->pages, req->num_pages, false);
}
static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1726,6 +1726,9 @@ copy_finish:
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
+ /* Don't try to move pages (yet) */
+ cs->move_pages = 0;
+
switch (code) {
case FUSE_NOTIFY_POLL:
return fuse_notify_poll(fc, size, cs);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 342f0239fcbf..33dec83eccfb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -680,6 +680,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
return create_new_entry(fc, req, dir, entry, S_IFLNK);
}
+static inline void fuse_update_ctime(struct inode *inode)
+{
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = current_fs_time(inode->i_sb);
+ mark_inode_dirty_sync(inode);
+ }
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
@@ -714,6 +722,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -744,23 +753,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
return err;
}
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
- struct inode *newdir, struct dentry *newent)
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags, int opcode, size_t argsize)
{
int err;
- struct fuse_rename_in inarg;
+ struct fuse_rename2_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
+ struct fuse_req *req;
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
- memset(&inarg, 0, sizeof(inarg));
+ memset(&inarg, 0, argsize);
inarg.newdir = get_node_id(newdir);
- req->in.h.opcode = FUSE_RENAME;
+ inarg.flags = flags;
+ req->in.h.opcode = opcode;
req->in.h.nodeid = get_node_id(olddir);
req->in.numargs = 3;
- req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].size = argsize;
req->in.args[0].value = &inarg;
req->in.args[1].size = oldent->d_name.len + 1;
req->in.args[1].value = oldent->d_name.name;
@@ -772,15 +784,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
if (!err) {
/* ctime changes */
fuse_invalidate_attr(oldent->d_inode);
+ fuse_update_ctime(oldent->d_inode);
+
+ if (flags & RENAME_EXCHANGE) {
+ fuse_invalidate_attr(newent->d_inode);
+ fuse_update_ctime(newent->d_inode);
+ }
fuse_invalidate_attr(olddir);
if (olddir != newdir)
fuse_invalidate_attr(newdir);
/* newent will end up negative */
- if (newent->d_inode) {
+ if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
fuse_invalidate_attr(newent->d_inode);
fuse_invalidate_entry_cache(newent);
+ fuse_update_ctime(newent->d_inode);
}
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
@@ -796,6 +815,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
return err;
}
+static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(olddir);
+ int err;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags) {
+ if (fc->no_rename2 || fc->minor < 23)
+ return -EINVAL;
+
+ err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ FUSE_RENAME2,
+ sizeof(struct fuse_rename2_in));
+ if (err == -ENOSYS) {
+ fc->no_rename2 = 1;
+ err = -EINVAL;
+ }
+ } else {
+ err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ FUSE_RENAME,
+ sizeof(struct fuse_rename_in));
+ }
+
+ return err;
+}
+
static int fuse_link(struct dentry *entry, struct inode *newdir,
struct dentry *newent)
{
@@ -830,6 +879,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
inc_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
} else if (err == -EINTR) {
fuse_invalidate_attr(inode);
}
@@ -840,6 +890,16 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
struct kstat *stat)
{
unsigned int blkbits;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /* see the comment in fuse_change_attributes() */
+ if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+ attr->size = i_size_read(inode);
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
@@ -1478,12 +1538,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
}
-static bool update_mtime(unsigned ivalid)
+static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
{
/* Always update if mtime is explicitly set */
if (ivalid & ATTR_MTIME_SET)
return true;
+ /* Or if kernel i_mtime is the official one */
+ if (trust_local_mtime)
+ return true;
+
/* If it's an open(O_TRUNC) or an ftruncate(), don't update */
if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
return false;
@@ -1492,7 +1556,8 @@ static bool update_mtime(unsigned ivalid)
return true;
}
-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
+static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
+ bool trust_local_mtime)
{
unsigned ivalid = iattr->ia_valid;
@@ -1511,11 +1576,11 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
if (!(ivalid & ATTR_ATIME_SET))
arg->valid |= FATTR_ATIME_NOW;
}
- if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) {
+ if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) {
arg->valid |= FATTR_MTIME;
arg->mtime = iattr->ia_mtime.tv_sec;
arg->mtimensec = iattr->ia_mtime.tv_nsec;
- if (!(ivalid & ATTR_MTIME_SET))
+ if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime)
arg->valid |= FATTR_MTIME_NOW;
}
}
@@ -1564,6 +1629,63 @@ void fuse_release_nowrite(struct inode *inode)
spin_unlock(&fc->lock);
}
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
+ struct inode *inode,
+ struct fuse_setattr_in *inarg_p,
+ struct fuse_attr_out *outarg_p)
+{
+ req->in.h.opcode = FUSE_SETATTR;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(*inarg_p);
+ req->in.args[0].value = inarg_p;
+ req->out.numargs = 1;
+ if (fc->minor < 9)
+ req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ else
+ req->out.args[0].size = sizeof(*outarg_p);
+ req->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_mtime(struct file *file, bool nofail)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req = NULL;
+ struct fuse_setattr_in inarg;
+ struct fuse_attr_out outarg;
+ int err;
+
+ if (nofail) {
+ req = fuse_get_req_nofail_nopages(fc, file);
+ } else {
+ req = fuse_get_req_nopages(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ }
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+
+ inarg.valid |= FATTR_MTIME;
+ inarg.mtime = inode->i_mtime.tv_sec;
+ inarg.mtimensec = inode->i_mtime.tv_nsec;
+
+ fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+
+ if (!err)
+ clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+
+ return err;
+}
+
/*
* Set attributes, and at the same time refresh them.
*
@@ -1581,8 +1703,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
+ bool is_wb = fc->writeback_cache;
loff_t oldsize;
int err;
+ bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode);
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
attr->ia_valid |= ATTR_FORCE;
@@ -1611,7 +1735,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(attr, &inarg);
+ iattr_to_fattr(attr, &inarg, trust_local_mtime);
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1622,17 +1746,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
inarg.valid |= FATTR_LOCKOWNER;
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
}
- req->in.h.opcode = FUSE_SETATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->out.numargs = 1;
- if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
- else
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
+ fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -1649,10 +1763,18 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
}
spin_lock(&fc->lock);
+ /* the kernel maintains i_mtime locally */
+ if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) {
+ inode->i_mtime = attr->ia_mtime;
+ clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+ }
+
fuse_change_attributes_common(inode, &outarg.attr,
attr_timeout(&outarg));
oldsize = inode->i_size;
- i_size_write(inode, outarg.attr.size);
+ /* see the comment in fuse_change_attributes() */
+ if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ i_size_write(inode, outarg.attr.size);
if (is_truncate) {
/* NOTE: this may release/reacquire fc->lock */
@@ -1664,7 +1786,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
* Only call invalidate_inode_pages2() after removing
* FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
*/
- if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ if ((is_truncate || !is_wb) &&
+ S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
truncate_pagecache(inode, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
}
@@ -1740,8 +1863,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err)
+ if (!err) {
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
return err;
}
@@ -1871,18 +1996,31 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err)
+ if (!err) {
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
return err;
}
+static int fuse_update_time(struct inode *inode, struct timespec *now,
+ int flags)
+{
+ if (flags & S_MTIME) {
+ inode->i_mtime = *now;
+ set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state);
+ BUG_ON(!S_ISREG(inode->i_mode));
+ }
+ return 0;
+}
+
static const struct inode_operations fuse_dir_inode_operations = {
.lookup = fuse_lookup,
.mkdir = fuse_mkdir,
.symlink = fuse_symlink,
.unlink = fuse_unlink,
.rmdir = fuse_rmdir,
- .rename = fuse_rename,
+ .rename2 = fuse_rename2,
.link = fuse_link,
.setattr = fuse_setattr,
.create = fuse_create,
@@ -1915,6 +2053,7 @@ static const struct inode_operations fuse_common_inode_operations = {
.getxattr = fuse_getxattr,
.listxattr = fuse_listxattr,
.removexattr = fuse_removexattr,
+ .update_time = fuse_update_time,
};
static const struct inode_operations fuse_symlink_inode_operations = {
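
fuse_rename2() above also shows the usual "probe once, cache ENOSYS" fallback for a new request opcode. A condensed sketch of just that control flow; the my_* helpers are hypothetical, and only the no_rename2 bit and the error handling mirror the diff:

static int my_rename2(struct fuse_conn *fc, unsigned int flags)
{
        int err;

        if (!flags)                             /* plain rename, old opcode */
                return my_send_rename(fc);      /* hypothetical helper */

        if (fc->no_rename2 || fc->minor < 23)
                return -EINVAL;                 /* flags cannot be honored */

        err = my_send_rename2(fc, flags);       /* hypothetical helper */
        if (err == -ENOSYS) {
                fc->no_rename2 = 1;             /* old server: remember, don't retry */
                err = -EINVAL;
        }
        return err;
}
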
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 77bcc303c3ae..5916dc51599b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
}
EXPORT_SYMBOL_GPL(fuse_do_open);
+static void fuse_link_write_file(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff = file->private_data;
+ /*
+ * file may be written through mmap, so chain it onto the
+ * inodes's write_file list
+ */
+ spin_lock(&fc->lock);
+ if (list_empty(&ff->write_entry))
+ list_add(&ff->write_entry, &fi->write_files);
+ spin_unlock(&fc->lock);
+}
+
void fuse_finish_open(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
@@ -292,6 +308,9 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
+ if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state))
+ fuse_flush_mtime(file, true);
+
fuse_release_common(file, FUSE_RELEASE);
/* return value is ignored by VFS */
@@ -459,6 +478,12 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
fuse_sync_writes(inode);
+ if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) {
+ int err = fuse_flush_mtime(file, false);
+ if (err)
+ goto out;
+ }
+
req = fuse_get_req_nopages(fc);
if (IS_ERR(req)) {
err = PTR_ERR(req);
@@ -655,6 +680,32 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
spin_unlock(&fc->lock);
}
+static void fuse_short_read(struct fuse_req *req, struct inode *inode,
+ u64 attr_ver)
+{
+ size_t num_read = req->out.args[0].size;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fc->writeback_cache) {
+ /*
+ * A hole in a file. Some data after the hole are in page cache,
+ * but have not reached the client fs yet. So, the hole is not
+ * present there.
+ */
+ int i;
+ int start_idx = num_read >> PAGE_CACHE_SHIFT;
+ size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+ for (i = start_idx; i < req->num_pages; i++) {
+ zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+ off = 0;
+ }
+ } else {
+ loff_t pos = page_offset(req->pages[0]) + num_read;
+ fuse_read_update_size(inode, pos, attr_ver);
+ }
+}
+
static int fuse_readpage(struct file *file, struct page *page)
{
struct fuse_io_priv io = { .async = 0, .file = file };
@@ -692,18 +743,18 @@ static int fuse_readpage(struct file *file, struct page *page)
req->page_descs[0].length = count;
num_read = fuse_send_read(req, &io, pos, count, NULL);
err = req->out.h.error;
- fuse_put_request(fc, req);
if (!err) {
/*
* Short read means EOF. If file size is larger, truncate it
*/
if (num_read < count)
- fuse_read_update_size(inode, pos + num_read, attr_ver);
+ fuse_short_read(req, inode, attr_ver);
SetPageUptodate(page);
}
+ fuse_put_request(fc, req);
fuse_invalidate_atime(inode);
out:
unlock_page(page);
@@ -726,13 +777,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
/*
* Short read means EOF. If file size is larger, truncate it
*/
- if (!req->out.h.error && num_read < count) {
- loff_t pos;
+ if (!req->out.h.error && num_read < count)
+ fuse_short_read(req, inode, req->misc.read.attr_ver);
- pos = page_offset(req->pages[0]) + num_read;
- fuse_read_update_size(inode, pos,
- req->misc.read.attr_ver);
- }
fuse_invalidate_atime(inode);
}
@@ -922,16 +969,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
return req->misc.write.out.size;
}
-void fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ bool ret = false;
spin_lock(&fc->lock);
fi->attr_version = ++fc->attr_version;
- if (pos > inode->i_size)
+ if (pos > inode->i_size) {
i_size_write(inode, pos);
+ ret = true;
+ }
spin_unlock(&fc->lock);
+
+ return ret;
}
static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -1003,13 +1055,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
-
if (!tmp) {
unlock_page(page);
page_cache_release(page);
@@ -1946,20 +1994,9 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
- struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_file *ff = file->private_data;
- /*
- * file may be written through mmap, so chain it onto the
- * inodes's write_file list
- */
- spin_lock(&fc->lock);
- if (list_empty(&ff->write_entry))
- list_add(&ff->write_entry, &fi->write_files);
- spin_unlock(&fc->lock);
- }
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ fuse_link_write_file(file);
+
file_accessed(file);
vma->vm_ops = &fuse_file_vm_ops;
return 0;
@@ -2850,8 +2887,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
/* we could have extended the file */
- if (!(mode & FALLOC_FL_KEEP_SIZE))
- fuse_write_update_size(inode, offset + length);
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ bool changed = fuse_write_update_size(inode, offset + length);
+
+ if (changed && fc->writeback_cache) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ inode->i_mtime = current_fs_time(inode->i_sb);
+ set_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+ }
+ }
if (mode & FALLOC_FL_PUNCH_HOLE)
truncate_pagecache_range(inode, offset, offset + length - 1);
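Since fuse_write_update_size() now reports whether it actually grew i_size, callers on the buffered-write side can keep mtime coherent the same way the fallocate hunk above does. A minimal sketch, assuming inode, fc, pos and the count of bytes written are in scope as in the surrounding code:

	if (fuse_write_update_size(inode, pos + written) && fc->writeback_cache) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		/* the size grew locally: stamp mtime and mark it for flushing */
		inode->i_mtime = current_fs_time(inode->i_sb);
		set_bit(FUSE_I_MTIME_DIRTY, &fi->state);
	}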
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2da5db2c8bdb..7cc58c976780 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -119,6 +119,8 @@ enum {
FUSE_I_INIT_RDPLUS,
/** An operation changing file size is in progress */
FUSE_I_SIZE_UNSTABLE,
+ /** i_mtime has been updated locally; a flush to userspace is needed */
+ FUSE_I_MTIME_DIRTY,
};
struct fuse_conn;
@@ -480,6 +482,9 @@ struct fuse_conn {
/** Set if bdi is valid */
unsigned bdi_initialized:1;
+ /** write-back cache policy (default is write-through) */
+ unsigned writeback_cache:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -539,6 +544,9 @@ struct fuse_conn {
/** Is fallocate not implemented by fs? */
unsigned no_fallocate:1;
+ /** Is rename with flags (renameat2) not implemented by fs? */
+ unsigned no_rename2:1;
+
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
@@ -873,7 +881,9 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
unsigned fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
-void fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_mtime(struct file *file, bool nofail);
int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 73f6bcb44ea8..8fe0b485908c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -170,10 +170,13 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_blocks = attr->blocks;
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
+ /* mtime from server may be stale due to local buffered write */
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
+ }
if (attr->blksize != 0)
inode->i_blkbits = ilog2(attr->blksize);
@@ -197,6 +200,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ bool is_wb = fc->writeback_cache;
loff_t oldsize;
struct timespec old_mtime;
@@ -211,10 +215,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
fuse_change_attributes_common(inode, attr, attr_valid);
oldsize = inode->i_size;
- i_size_write(inode, attr->size);
+ /*
+ * When writeback_cache is enabled, cached writes beyond EOF extend the
+ * local i_size without keeping the userspace server in sync, so the
+ * attr->size coming from the server can be stale and cannot be trusted.
+ */
+ if (!is_wb || !S_ISREG(inode->i_mode))
+ i_size_write(inode, attr->size);
spin_unlock(&fc->lock);
- if (S_ISREG(inode->i_mode)) {
+ if (!is_wb && S_ISREG(inode->i_mode)) {
bool inval = false;
if (oldsize != attr->size) {
@@ -243,6 +253,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
{
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
fuse_init_file_inode(inode);
@@ -289,7 +303,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
return NULL;
if ((inode->i_state & I_NEW)) {
- inode->i_flags |= S_NOATIME|S_NOCMTIME;
+ inode->i_flags |= S_NOATIME;
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+ inode->i_flags |= S_NOCMTIME;
inode->i_generation = generation;
inode->i_data.backing_dev_info = &fc->bdi;
fuse_init_inode(inode, attr);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 49436fa7cd4f..4ccb60d943bb 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
p = kmap_atomic(page);
memcpy(buf + copied, p + offset, amt);
kunmap_atomic(p);
- mark_page_accessed(page);
page_cache_release(page);
copied += amt;
index++;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 8b9b3775e2e7..c41d255b6a7b 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -69,10 +69,12 @@ struct get_name_filldir {
char *name;
};
-static int get_name_filldir(void *opaque, const char *name, int length,
- loff_t offset, u64 inum, unsigned int type)
+static int get_name_filldir(struct dir_context *ctx, const char *name,
+ int length, loff_t offset, u64 inum,
+ unsigned int type)
{
- struct get_name_filldir *gnfd = opaque;
+ struct get_name_filldir *gnfd =
+ container_of(ctx, struct get_name_filldir, ctx);
if (inum != gnfd->inum.no_addr)
return 0;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5c524180c98e..bc643b97423c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -606,8 +606,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!IS_ERR(inode)) {
d = d_splice_alias(inode, dentry);
error = PTR_ERR(d);
- if (IS_ERR(d))
+ if (IS_ERR(d)) {
+ inode = ERR_CAST(d);
goto fail_gunlock;
+ }
error = 0;
if (file) {
if (S_ISREG(inode->i_mode)) {
@@ -823,7 +825,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
d = d_splice_alias(inode, dentry);
if (IS_ERR(d)) {
- iput(inode);
gfs2_glock_dq_uninit(&gh);
return d;
}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index c7f24690ed05..e7b149614f5e 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = {
.releasepage = gfs2_releasepage,
};
+const struct address_space_operations gfs2_rgrp_aops = {
+ .writepage = gfs2_aspace_writepage,
+ .releasepage = gfs2_releasepage,
+};
+
/**
* gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
@@ -131,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
yield();
}
} else {
- page = find_lock_page(mapping, index);
+ page = find_get_page_flags(mapping, index,
+ FGP_LOCK|FGP_ACCESSED);
if (!page)
return NULL;
}
@@ -148,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
map_bh(bh, sdp->sd_vfs, blkno);
unlock_page(page);
- mark_page_accessed(page);
page_cache_release(page);
return bh;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 4823b934208a..ac5d8027d335 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
}
extern const struct address_space_operations gfs2_meta_aops;
+extern const struct address_space_operations gfs2_rgrp_aops;
static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
if (mapping->a_ops == &gfs2_meta_aops)
return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+ else if (mapping->a_ops == &gfs2_rgrp_aops)
+ return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
return inode->i_sb->s_fs_info;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c6872d09561a..f6c9d83aa39b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -104,7 +104,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mapping = &sdp->sd_aspace;
address_space_init_once(mapping);
- mapping->a_ops = &gfs2_meta_aops;
+ mapping->a_ops = &gfs2_rgrp_aops;
mapping->host = sb->s_bdev->bd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 6e560d56094b..754fdf8c6356 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update the parent key if we inserted a key at the start of
+ * this node and this node is not the newly allocated one
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -168,9 +171,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -370,6 +370,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd, hfs_find_rec_by_key);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 4338ff32959d..5f2755117ce7 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -548,10 +548,11 @@ struct hppfs_dirent {
struct dentry *dentry;
};
-static int hppfs_filldir(void *d, const char *name, int size,
+static int hppfs_filldir(struct dir_context *ctx, const char *name, int size,
loff_t offset, u64 inode, unsigned int type)
{
- struct hppfs_dirent *dirent = d;
+ struct hppfs_dirent *dirent =
+ container_of(ctx, struct hppfs_dirent, ctx);
if (file_removed(dirent->dentry, name))
return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..a4a8ed56e438 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
int error;
int i;
+ if (!hugepages_supported()) {
+ pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
+ return -ENOTSUPP;
+ }
+
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
return error;
diff --git a/fs/internal.h b/fs/internal.h
index 465742407466..dd41b12c13f1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -42,7 +42,6 @@ extern void __init chrdev_init(void);
/*
* namei.c
*/
-extern int __inode_permission(struct inode *, int);
extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
const char *, unsigned int, struct path *);
@@ -135,12 +134,6 @@ extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
/*
- * splice.c
- */
-extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- loff_t *opos, size_t len, unsigned int flags);
-
-/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index e50170ca7c33..31666c92b46a 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -157,14 +157,16 @@ out:
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
- unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
+ unsigned short aclass;
+ unsigned short bclass;
- if (aclass == IOPRIO_CLASS_NONE)
- aclass = IOPRIO_CLASS_BE;
- if (bclass == IOPRIO_CLASS_NONE)
- bclass = IOPRIO_CLASS_BE;
+ if (!ioprio_valid(aprio))
+ aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+ if (!ioprio_valid(bprio))
+ bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+ aclass = IOPRIO_PRIO_CLASS(aprio);
+ bclass = IOPRIO_PRIO_CLASS(bprio);
if (aclass == bclass)
return min(aprio, bprio);
if (aclass > bclass)
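With this change an unset (IOPRIO_CLASS_NONE) priority is normalized to best-effort at the default level before the classes are compared. For illustration, using the standard macros from <linux/ioprio.h>:

	unsigned short none = 0;                                     /* IOPRIO_CLASS_NONE */
	unsigned short rt0  = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); /* highest realtime  */

	ioprio_best(none, rt0);   /* == rt0: RT outranks the implied best-effort value */
	ioprio_best(none, none);  /* == IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) */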
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index f488bbae541a..735d7522a3a9 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -30,6 +30,7 @@ struct rock_state {
int cont_size;
int cont_extent;
int cont_offset;
+ int cont_loops;
struct inode *inode;
};
@@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode)
rs->inode = inode;
}
+/* Maximum number of Rock Ridge continuation entries */
+#define RR_MAX_CE_ENTRIES 32
+
/*
* Returns 0 if the caller should continue scanning, 1 if the scan must end
* and -ve on error.
@@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs)
goto out;
}
ret = -EIO;
+ if (++rs->cont_loops >= RR_MAX_CE_ENTRIES)
+ goto out;
bh = sb_bread(rs->inode->i_sb, rs->cont_extent);
if (bh) {
memcpy(rs->buffer, bh->b_data + rs->cont_offset,
@@ -356,6 +362,9 @@ repeat:
rs.cont_size = isonum_733(rr->u.CE.size);
break;
case SIG('E', 'R'):
+ /* Invalid length of ER tag id? */
+ if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
+ goto out;
ISOFS_SB(inode->i_sb)->s_rock = 1;
printk(KERN_DEBUG "ISO 9660 Extensions: ");
{
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9b329b55ffe3..bcbef08a4d8f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal,
!jbd2_descr_block_csum_verify(journal,
bh->b_data)) {
err = -EIO;
+ brelse(bh);
goto failed;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index a69e426435dd..5b234db85854 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -687,7 +687,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
struct inode *inode = OFNI_EDONI_2SFFJ(f);
struct page *pg;
- pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
(void *)jffs2_do_readpage_unlock, inode);
if (IS_ERR(pg))
return (void *)pg;
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 413ef89c2d1b..046fee8b6e9b 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -134,8 +134,6 @@ struct jffs2_sb_info {
struct rw_semaphore wbuf_sem; /* Protects the write buffer */
struct delayed_work wbuf_dwork; /* write-buffer write-out work */
- int wbuf_queued; /* non-zero delayed work is queued */
- spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */
unsigned char *oobbuf;
int oobavail; /* How many bytes are available for JFFS2 in OOB */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87b0428..9ad5ba4b299b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
sumlen = c->sector_size - je32_to_cpu(sm->offset);
sumptr = buf + buf_size - sumlen;
+ /* sm->offset may be wrong, but the MAGIC may be right */
+ if (sumlen > c->sector_size)
+ goto full_scan;
+
/* Now, make sure the summary itself is available */
if (sumlen > buf_size) {
/* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
}
}
+full_scan:
buf_ofs = jeb->offset;
if (!buf_size) {
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a6597d60d76d..09ed55190ee2 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1162,10 +1162,6 @@ static void delayed_wbuf_sync(struct work_struct *work)
struct jffs2_sb_info *c = work_to_sb(work);
struct super_block *sb = OFNI_BS_2SFFJ(c);
- spin_lock(&c->wbuf_dwork_lock);
- c->wbuf_queued = 0;
- spin_unlock(&c->wbuf_dwork_lock);
-
if (!(sb->s_flags & MS_RDONLY)) {
jffs2_dbg(1, "%s()\n", __func__);
jffs2_flush_wbuf_gc(c, 0);
@@ -1180,14 +1176,9 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
if (sb->s_flags & MS_RDONLY)
return;
- spin_lock(&c->wbuf_dwork_lock);
- if (!c->wbuf_queued) {
+ delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay))
jffs2_dbg(1, "%s()\n", __func__);
- delay = msecs_to_jiffies(dirty_writeback_interval * 10);
- queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay);
- c->wbuf_queued = 1;
- }
- spin_unlock(&c->wbuf_dwork_lock);
}
int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
@@ -1211,7 +1202,6 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
/* Initialise write buffer */
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
c->wbuf_ofs = 0xFFFFFFFF;
@@ -1251,7 +1241,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
/* Initialize write buffer */
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->erasesize;
@@ -1311,7 +1300,6 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
/* Initialize write buffer */
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
@@ -1346,7 +1334,6 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
return 0;
init_rwsem(&c->wbuf_sem);
- spin_lock_init(&c->wbuf_dwork_lock);
INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
c->wbuf_pagesize = c->mtd->writesize;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1812f026960c..6ae664b489af 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+ if (status == -ECONNREFUSED) {
+ dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
+ status);
+ rpc_force_rebind(clnt);
+ status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
+ }
if (status < 0)
dprintk("lockd: NSM upcall RPC failed, status=%d\n",
status);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 6bf06a07f3e0..59a53f664005 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -137,10 +137,6 @@ lockd(void *vrqstp)
dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
- if (!nlm_timeout)
- nlm_timeout = LOCKD_DFLT_TIMEO;
- nlmsvc_timeout = nlm_timeout * HZ;
-
/*
* The main request loop. We don't terminate until the last
* NFS mount or NFS daemon has gone away.
@@ -253,13 +249,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
error = make_socks(serv, net);
if (error < 0)
- goto err_socks;
+ goto err_bind;
set_grace_period(net);
dprintk("lockd_up_net: per-net data created; net=%p\n", net);
return 0;
-err_socks:
- svc_rpcb_cleanup(serv, net);
err_bind:
ln->nlmsvc_users--;
return error;
@@ -348,6 +342,10 @@ static struct svc_serv *lockd_create_svc(void)
printk(KERN_WARNING
"lockd_up: no pid, %d users??\n", nlmsvc_users);
+ if (!nlm_timeout)
+ nlm_timeout = LOCKD_DFLT_TIMEO;
+ nlmsvc_timeout = nlm_timeout * HZ;
+
serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
diff --git a/fs/locks.c b/fs/locks.c
index 4dd39b98a6a3..2c61c4e9368c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2235,16 +2235,28 @@ void locks_remove_flock(struct file *filp)
while ((fl = *before) != NULL) {
if (fl->fl_file == filp) {
- if (IS_FLOCK(fl)) {
- locks_delete_lock(before);
- continue;
- }
if (IS_LEASE(fl)) {
lease_modify(before, F_UNLCK);
continue;
}
- /* What? */
- BUG();
+
+ /*
+ * There's a leftover lock on the list of a type that
+ * we didn't expect to see. Most likely a classic
+ * POSIX lock that ended up not getting released
+ * properly, or that raced onto the list somehow. Log
+ * some info about it and then just remove it from
+ * the list.
+ */
+ WARN(!IS_FLOCK(fl),
+ "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
+ MAJOR(inode->i_sb->s_dev),
+ MINOR(inode->i_sb->s_dev), inode->i_ino,
+ fl->fl_type, fl->fl_flags,
+ fl->fl_start, fl->fl_end);
+
+ locks_delete_lock(before);
+ continue;
}
before = &fl->fl_next;
}
diff --git a/fs/namei.c b/fs/namei.c
index d5a4faeb39a5..0fbf1504fb63 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -415,6 +415,7 @@ int __inode_permission(struct inode *inode, int mask)
return security_inode_permission(inode, mask);
}
+EXPORT_SYMBOL(__inode_permission);
/**
* sb_permission - Check superblock-level permissions
@@ -642,24 +643,22 @@ static int complete_walk(struct nameidata *nd)
static __always_inline void set_root(struct nameidata *nd)
{
- if (!nd->root.mnt)
- get_fs_root(current->fs, &nd->root);
+ get_fs_root(current->fs, &nd->root);
}
static int link_path_walk(const char *, struct nameidata *);
-static __always_inline void set_root_rcu(struct nameidata *nd)
+static __always_inline unsigned set_root_rcu(struct nameidata *nd)
{
- if (!nd->root.mnt) {
- struct fs_struct *fs = current->fs;
- unsigned seq;
+ struct fs_struct *fs = current->fs;
+ unsigned seq, res;
- do {
- seq = read_seqcount_begin(&fs->seq);
- nd->root = fs->root;
- nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
- }
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ nd->root = fs->root;
+ res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ } while (read_seqcount_retry(&fs->seq, seq));
+ return res;
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -859,7 +858,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
return PTR_ERR(s);
}
if (*s == '/') {
- set_root(nd);
+ if (!nd->root.mnt)
+ set_root(nd);
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->root);
@@ -1132,7 +1132,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
static int follow_dotdot_rcu(struct nameidata *nd)
{
- set_root_rcu(nd);
+ if (!nd->root.mnt)
+ set_root_rcu(nd);
while (1) {
if (nd->path.dentry == nd->root.dentry &&
@@ -1244,7 +1245,8 @@ static void follow_mount(struct path *path)
static void follow_dotdot(struct nameidata *nd)
{
- set_root(nd);
+ if (!nd->root.mnt)
+ set_root(nd);
while(1) {
struct dentry *old = nd->path.dentry;
@@ -1842,7 +1844,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (*name=='/') {
if (flags & LOOKUP_RCU) {
rcu_read_lock();
- set_root_rcu(nd);
+ nd->seq = set_root_rcu(nd);
} else {
set_root(nd);
path_get(&nd->root);
@@ -2358,22 +2360,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
}
EXPORT_SYMBOL(kern_path_mountpoint);
-/*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
- */
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct inode *dir, struct inode *inode)
{
kuid_t fsuid = current_fsuid();
- if (!(dir->i_mode & S_ISVTX))
- return 0;
if (uid_eq(inode->i_uid, fsuid))
return 0;
if (uid_eq(dir->i_uid, fsuid))
return 0;
return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
+EXPORT_SYMBOL(__check_sticky);
/*
* Check whether we can remove a link victim from directory dir, check
@@ -2476,7 +2473,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
}
mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
return NULL;
}
@@ -3036,9 +3033,12 @@ finish_open_created:
error = may_open(&nd->path, acc_mode, open_flag);
if (error)
goto out;
- file->f_path.mnt = nd->path.mnt;
- error = finish_open(file, nd->path.dentry, NULL, opened);
- if (error) {
+
+ BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+ error = vfs_open(&nd->path, file, current_cred());
+ if (!error) {
+ *opened |= FILE_OPENED;
+ } else {
if (error == -EOPENSTALE)
goto stale_open;
goto out;
@@ -3127,7 +3127,8 @@ static int do_tmpfile(int dfd, struct filename *pathname,
if (error)
goto out2;
audit_inode(pathname, nd->path.dentry, 0);
- error = may_open(&nd->path, op->acc_mode, op->open_flag);
+ /* Don't check for other permissions; the inode was just created */
+ error = may_open(&nd->path, MAY_OPEN, op->open_flag);
if (error)
goto out2;
file->f_path.mnt = nd->path.mnt;
@@ -3975,7 +3976,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
-/*
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir: parent of source
+ * @old_dentry: source
+ * @new_dir: parent of destination
+ * @new_dentry: destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags: rename flags
+ *
+ * The caller must hold multiple mutexes; see lock_rename().
+ *
+ * If vfs_rename discovers a delegation in need of breaking at either
+ * the source or destination, it will return -EWOULDBLOCK and return a
+ * reference to the inode in delegated_inode. The caller should then
+ * break the delegation and retry. Because breaking a delegation may
+ * take a long time, the caller should drop all locks before doing
+ * so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode. This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ *
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
* Problems:
@@ -4003,163 +4025,140 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
-static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ struct inode **delegated_inode, unsigned int flags)
{
- int error = 0;
+ int error;
+ bool is_dir = d_is_dir(old_dentry);
+ const unsigned char *old_name;
+ struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
+ bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
+ if (source == target)
+ return 0;
+
+ error = may_delete(old_dir, old_dentry, is_dir);
+ if (error)
+ return error;
+
+ if (!target) {
+ error = may_create(new_dir, new_dentry);
+ } else {
+ new_is_dir = d_is_dir(new_dentry);
+
+ if (!(flags & RENAME_EXCHANGE))
+ error = may_delete(new_dir, new_dentry, is_dir);
+ else
+ error = may_delete(new_dir, new_dentry, new_is_dir);
+ }
+ if (error)
+ return error;
+
+ if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
+ return -EPERM;
+
+ if (flags && !old_dir->i_op->rename2)
+ return -EINVAL;
+
/*
* If we are going to change the parent - check write permissions,
* we'll need to flip '..'.
*/
if (new_dir != old_dir) {
- error = inode_permission(old_dentry->d_inode, MAY_WRITE);
- if (error)
- return error;
+ if (is_dir) {
+ error = inode_permission(source, MAY_WRITE);
+ if (error)
+ return error;
+ }
+ if ((flags & RENAME_EXCHANGE) && new_is_dir) {
+ error = inode_permission(target, MAY_WRITE);
+ if (error)
+ return error;
+ }
}
- error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
+ error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
if (error)
return error;
+ old_name = fsnotify_oldname_init(old_dentry->d_name.name);
dget(new_dentry);
- if (target)
+ if (!is_dir || (flags & RENAME_EXCHANGE))
+ lock_two_nondirectories(source, target);
+ else if (target)
mutex_lock(&target->i_mutex);
error = -EBUSY;
if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
goto out;
- error = -EMLINK;
- if (max_links && !target && new_dir != old_dir &&
- new_dir->i_nlink >= max_links)
- goto out;
-
- if (target)
+ if (max_links && new_dir != old_dir) {
+ error = -EMLINK;
+ if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+ goto out;
+ if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+ old_dir->i_nlink >= max_links)
+ goto out;
+ }
+ if (is_dir && !(flags & RENAME_EXCHANGE) && target)
shrink_dcache_parent(new_dentry);
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- if (error)
- goto out;
-
- if (target) {
- target->i_flags |= S_DEAD;
- dont_mount(new_dentry);
+ if (!is_dir) {
+ error = try_break_deleg(source, delegated_inode);
+ if (error)
+ goto out;
}
-out:
- if (target)
- mutex_unlock(&target->i_mutex);
- dput(new_dentry);
- if (!error)
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry,new_dentry);
- return error;
-}
-
-static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- struct inode **delegated_inode)
-{
- struct inode *target = new_dentry->d_inode;
- struct inode *source = old_dentry->d_inode;
- int error;
-
- error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
- if (error)
- return error;
-
- dget(new_dentry);
- lock_two_nondirectories(source, target);
-
- error = -EBUSY;
- if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- goto out;
-
- error = try_break_deleg(source, delegated_inode);
- if (error)
- goto out;
- if (target) {
+ if (target && !new_is_dir) {
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
}
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (!old_dir->i_op->rename2) {
+ error = old_dir->i_op->rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ } else {
+ WARN_ON(old_dir->i_op->rename != NULL);
+ error = old_dir->i_op->rename2(old_dir, old_dentry,
+ new_dir, new_dentry, flags);
+ }
if (error)
goto out;
- if (target)
+ if (!(flags & RENAME_EXCHANGE) && target) {
+ if (is_dir)
+ target->i_flags |= S_DEAD;
dont_mount(new_dentry);
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry, new_dentry);
+ }
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
+ if (!(flags & RENAME_EXCHANGE))
+ d_move(old_dentry, new_dentry);
+ else
+ d_exchange(old_dentry, new_dentry);
+ }
out:
- unlock_two_nondirectories(source, target);
+ if (!is_dir || (flags & RENAME_EXCHANGE))
+ unlock_two_nondirectories(source, target);
+ else if (target)
+ mutex_unlock(&target->i_mutex);
dput(new_dentry);
- return error;
-}
-
-/**
- * vfs_rename - rename a filesystem object
- * @old_dir: parent of source
- * @old_dentry: source
- * @new_dir: parent of destination
- * @new_dentry: destination
- * @delegated_inode: returns an inode needing a delegation break
- *
- * The caller must hold multiple mutexes--see lock_rename()).
- *
- * If vfs_rename discovers a delegation in need of breaking at either
- * the source or destination, it will return -EWOULDBLOCK and return a
- * reference to the inode in delegated_inode. The caller should then
- * break the delegation and retry. Because breaking a delegation may
- * take a long time, the caller should drop all locks before doing
- * so.
- *
- * Alternatively, a caller may pass NULL for delegated_inode. This may
- * be appropriate for callers that expect the underlying filesystem not
- * to be NFS exported.
- */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- struct inode **delegated_inode)
-{
- int error;
- int is_dir = d_is_dir(old_dentry);
- const unsigned char *old_name;
-
- if (old_dentry->d_inode == new_dentry->d_inode)
- return 0;
-
- error = may_delete(old_dir, old_dentry, is_dir);
- if (error)
- return error;
-
- if (!new_dentry->d_inode)
- error = may_create(new_dir, new_dentry);
- else
- error = may_delete(new_dir, new_dentry, is_dir);
- if (error)
- return error;
-
- if (!old_dir->i_op->rename)
- return -EPERM;
-
- old_name = fsnotify_oldname_init(old_dentry->d_name.name);
-
- if (is_dir)
- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
- else
- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
- if (!error)
+ if (!error) {
fsnotify_move(old_dir, new_dir, old_name, is_dir,
- new_dentry->d_inode, old_dentry);
+ !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+ if (flags & RENAME_EXCHANGE) {
+ fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+ new_is_dir, NULL, new_dentry);
+ }
+ }
fsnotify_oldname_free(old_name);
return error;
}
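A minimal sketch of the calling convention documented above, following the retry_deleg pattern that renameat2() uses further down; break_deleg_wait() is assumed to be the usual delegation helper from <linux/fs.h>, and the lookup, mountpoint and -EEXIST/-ENOENT checks the syscall performs are omitted:

	static int rename_retrying_delegations(struct dentry *old_dir,
					       struct dentry *old_dentry,
					       struct dentry *new_dir,
					       struct dentry *new_dentry,
					       unsigned int flags)
	{
		struct inode *delegated_inode = NULL;
		struct dentry *trap;
		int error;

	retry_deleg:
		trap = lock_rename(new_dir, old_dir);
		error = -EINVAL;
		if (old_dentry == trap || new_dentry == trap)	/* ancestor checks */
			goto unlock;
		error = vfs_rename(old_dir->d_inode, old_dentry,
				   new_dir->d_inode, new_dentry,
				   &delegated_inode, flags);
	unlock:
		unlock_rename(new_dir, old_dir);
		if (delegated_inode) {
			/* all locks dropped: break the delegation, then retry */
			error = break_deleg_wait(&delegated_inode);
			if (!error)
				goto retry_deleg;
		}
		return error;
	}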
-SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
- int, newdfd, const char __user *, newname)
+SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname, unsigned int, flags)
{
struct dentry *old_dir, *new_dir;
struct dentry *old_dentry, *new_dentry;
@@ -4171,6 +4170,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
unsigned int lookup_flags = 0;
bool should_retry = false;
int error;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ return -EINVAL;
+
+ if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+ (flags & RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+ return -EPERM;
+
retry:
from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
if (IS_ERR(from)) {
@@ -4194,6 +4204,8 @@ retry:
goto exit2;
new_dir = newnd.path.dentry;
+ if (flags & RENAME_NOREPLACE)
+ error = -EEXIST;
if (newnd.last_type != LAST_NORM)
goto exit2;
@@ -4203,7 +4215,8 @@ retry:
oldnd.flags &= ~LOOKUP_PARENT;
newnd.flags &= ~LOOKUP_PARENT;
- newnd.flags |= LOOKUP_RENAME_TARGET;
+ if (!(flags & RENAME_EXCHANGE))
+ newnd.flags |= LOOKUP_RENAME_TARGET;
retry_deleg:
trap = lock_rename(new_dir, old_dir);
@@ -4216,34 +4229,49 @@ retry_deleg:
error = -ENOENT;
if (d_is_negative(old_dentry))
goto exit4;
+ new_dentry = lookup_hash(&newnd);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry))
+ goto exit4;
+ error = -EEXIST;
+ if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
+ goto exit5;
+ if (flags & RENAME_EXCHANGE) {
+ error = -ENOENT;
+ if (d_is_negative(new_dentry))
+ goto exit5;
+
+ if (!d_is_dir(new_dentry)) {
+ error = -ENOTDIR;
+ if (newnd.last.name[newnd.last.len])
+ goto exit5;
+ }
+ }
/* unless the source is a directory trailing slashes give -ENOTDIR */
if (!d_is_dir(old_dentry)) {
error = -ENOTDIR;
if (oldnd.last.name[oldnd.last.len])
- goto exit4;
- if (newnd.last.name[newnd.last.len])
- goto exit4;
+ goto exit5;
+ if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+ goto exit5;
}
/* source should not be ancestor of target */
error = -EINVAL;
if (old_dentry == trap)
- goto exit4;
- new_dentry = lookup_hash(&newnd);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
+ goto exit5;
/* target should not be an ancestor of source */
- error = -ENOTEMPTY;
+ if (!(flags & RENAME_EXCHANGE))
+ error = -ENOTEMPTY;
if (new_dentry == trap)
goto exit5;
error = security_path_rename(&oldnd.path, old_dentry,
- &newnd.path, new_dentry);
+ &newnd.path, new_dentry, flags);
if (error)
goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
- new_dir->d_inode, new_dentry,
- &delegated_inode);
+ new_dir->d_inode, new_dentry,
+ &delegated_inode, flags);
exit5:
dput(new_dentry);
exit4:
@@ -4273,16 +4301,34 @@ exit:
return error;
}
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname)
+{
+ return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
+}
+
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
- return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+ return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
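From userspace the new flags can be exercised with the raw syscall; a hedged sketch (SYS_renameat2 and the RENAME_* constants may not be exposed by older libc or kernel headers, so the fallback definitions below mirror include/uapi/linux/fs.h):

	#include <fcntl.h>		/* AT_FDCWD */
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef RENAME_NOREPLACE
	#define RENAME_NOREPLACE (1 << 0)	/* don't overwrite an existing target */
	#endif
	#ifndef RENAME_EXCHANGE
	#define RENAME_EXCHANGE  (1 << 1)	/* atomically swap source and target  */
	#endif

	int main(void)
	{
		/* fail with EEXIST instead of silently replacing "new" ... */
		if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
			    RENAME_NOREPLACE) == -1)
			return 1;

		/* ... or atomically exchange two existing paths */
		if (syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
			    RENAME_EXCHANGE) == -1)
			return 1;
		return 0;
	}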
-int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
+int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
- int len;
+ int error = may_create(dir, dentry);
+ if (error)
+ return error;
- len = PTR_ERR(link);
+ if (!dir->i_op->mknod)
+ return -EPERM;
+
+ return dir->i_op->mknod(dir, dentry,
+ S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
+EXPORT_SYMBOL(vfs_whiteout);
+
+int readlink_copy(char __user *buffer, int buflen, const char *link)
+{
+ int len = PTR_ERR(link);
if (IS_ERR(link))
goto out;
@@ -4311,7 +4357,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
if (IS_ERR(cookie))
return PTR_ERR(cookie);
- res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ res = readlink_copy(buffer, buflen, nd_get_link(&nd));
if (dentry->d_inode->i_op->put_link)
dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
return res;
@@ -4335,8 +4381,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct page *page = NULL;
- char *s = page_getlink(dentry, &page);
- int res = vfs_readlink(dentry,buffer,buflen,s);
+ int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
if (page) {
kunmap(page);
page_cache_release(page);
@@ -4432,7 +4477,6 @@ EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(generic_permission);
-EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
diff --git a/fs/namespace.c b/fs/namespace.c
index 75536db4b69b..c6533ce9cdee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1295,6 +1295,8 @@ void umount_tree(struct mount *mnt, int how)
}
if (last) {
last->mnt_hash.next = unmounted.first;
+ if (unmounted.first)
+ unmounted.first->pprev = &last->mnt_hash.next;
unmounted.first = tmp_list.first;
unmounted.first->pprev = &unmounted.first;
}
@@ -1365,6 +1367,8 @@ static int do_umount(struct mount *mnt, int flags)
* Special case for "unmounting" root ...
* we just try to remount it readonly.
*/
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
down_write(&sb->s_umount);
if (!(sb->s_flags & MS_RDONLY))
retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
@@ -1437,6 +1441,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
goto dput_and_out;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto dput_and_out;
+ retval = -EPERM;
+ if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ goto dput_and_out;
retval = do_umount(mnt, flags);
dput_and_out:
@@ -1579,6 +1586,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
namespace_unlock();
}
+/**
+ * clone_private_mount - create a private clone of a path
+ *
+ * This creates a new vfsmount, which will be a clone of @path. The new mount
+ * will not be attached anywhere in the namespace and will be private (i.e.
+ * changes to the originating mount won't be propagated into it).
+ *
+ * Release with mntput().
+ */
+struct vfsmount *clone_private_mount(struct path *path)
+{
+ struct mount *old_mnt = real_mount(path->mnt);
+ struct mount *new_mnt;
+
+ if (IS_MNT_UNBINDABLE(old_mnt))
+ return ERR_PTR(-EINVAL);
+
+ down_read(&namespace_sem);
+ new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+ up_read(&namespace_sem);
+ if (IS_ERR(new_mnt))
+ return ERR_CAST(new_mnt);
+
+ return &new_mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(clone_private_mount);
+
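A minimal sketch of the intended use described in the comment above; path is assumed to be a struct path the caller already holds a reference on:

	struct vfsmount *mnt;

	mnt = clone_private_mount(&path);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);
	/* ... use the private clone, e.g. for internal lookups ... */
	mntput(mnt);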
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
struct vfsmount *root)
{
@@ -1962,7 +1996,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- return -EPERM;
+ /* Was nodev implicitly added at mount time? */
+ if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
+ !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+ mnt_flags |= MNT_NODEV;
+ } else {
+ return -EPERM;
+ }
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
@@ -2829,6 +2869,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
+ /* make certain new is below the root */
+ if (!is_path_reachable(new_mnt, new.dentry, &root))
+ goto out4;
root_mp->m_count++; /* pin it so it won't go away */
lock_mount_hash();
detach_mnt(new_mnt, &parent_path);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60426ccb3b65..2f970de02b16 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -448,7 +448,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
result = -EIO;
}
}
- result = 0;
}
mutex_unlock(&server->root_setup_lock);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 56ff823ca82e..65d849bdf77a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (end != NFS_I(inode)->npages) {
rcu_read_lock();
- end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+ end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed9..0a2016bd6e58 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
struct rpc_rqst, rq_bc_list);
list_del(&req->rq_bc_list);
spin_unlock_bh(&serv->sv_cb_lock);
+ finish_wait(&serv->sv_cb_waitq, &wq);
dprintk("Invoking bc_svc_process()\n");
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
- schedule();
+ /* schedule_timeout to game the hung task watchdog */
+ schedule_timeout(60 * HZ);
+ finish_wait(&serv->sv_cb_waitq, &wq);
}
- finish_wait(&serv->sv_cb_waitq, &wq);
}
return 0;
}
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f4ccfe6521ec..02f8d09e119f 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
for (i = 0; i < args->csa_nrclists; i++) {
status = decode_rc_list(xdr, &args->csa_rclists[i]);
- if (status)
+ if (status) {
+ args->csa_nrclists = i;
goto out_free;
+ }
}
}
status = 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5c..2ea3537b8bde 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -109,6 +109,8 @@ again:
continue;
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
get_nfs_open_context(ctx);
@@ -159,8 +161,8 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
&delegation->flags);
NFS_I(inode)->delegation_state = delegation->type;
spin_unlock(&delegation->lock);
- put_rpccred(oldcred);
rcu_read_unlock();
+ put_rpccred(oldcred);
trace_nfs4_reclaim_delegation(inode, res->delegation_type);
} else {
/* We appear to have raced with a delegation return. */
@@ -177,7 +179,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
{
int res = 0;
- res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ res = nfs4_proc_delegreturn(inode,
+ delegation->cred,
+ &delegation->stateid,
+ issync);
nfs_free_delegation(delegation);
return res;
}
@@ -364,11 +370,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
- int err;
+ int err = 0;
if (delegation == NULL)
return 0;
do {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
err = nfs_delegation_claim_opens(inode, &delegation->stateid);
if (!issync || err != -EAGAIN)
break;
@@ -589,10 +597,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
rcu_read_unlock();
}
+static void nfs_revoke_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+ }
+ rcu_read_unlock();
+}
+
void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
+ nfs_revoke_delegation(inode);
delegation = nfs_inode_detach_delegation(inode);
if (delegation) {
nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6d..e02b090ab9da 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
NFS_DELEGATION_RETURN_IF_CLOSED,
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b8797ae6831f..7ececa1c6c4f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -123,6 +123,12 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
*/
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+ /* we only support swap files calling nfs_direct_IO */
+ if (!IS_SWAPFILE(inode))
+ return 0;
+
#ifndef CONFIG_NFS_SWAP
dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
iocb->ki_filp, (long long) pos, nr_segs);
@@ -178,6 +184,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+ nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 15f9d98627a4..6659ce545f15 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
- int err;
+ int err = 0;
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0e46d3d1b6cc..d3f606255b99 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos" */
if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (pos->cl_clientid != new->cl_clientid)
continue;
@@ -564,20 +565,14 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
}
/*
- * Returns true if the server owners match
+ * Returns true if the server major ids match
*/
static bool
-nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
{
struct nfs41_server_owner *o1 = a->cl_serverowner;
struct nfs41_server_owner *o2 = b->cl_serverowner;
- if (o1->minor_id != o2->minor_id) {
- dprintk("NFS: --> %s server owner minor IDs do not match\n",
- __func__);
- return false;
- }
-
if (o1->major_id_sz != o2->major_id_sz)
goto out_major_mismatch;
if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
@@ -615,6 +610,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+ if (pos->rpc_ops != new->rpc_ops)
+ continue;
+
+ if (pos->cl_proto != new->cl_proto)
+ continue;
+
+ if (pos->cl_minorversion != new->cl_minorversion)
+ continue;
+
/* If "pos" isn't marked ready, we can't trust the
* remaining fields in "pos", especially the client
* ID and serverowner fields. Wait for CREATE_SESSION
@@ -628,7 +633,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
prev = pos;
status = nfs_wait_client_init_complete(pos);
- if (status == 0) {
+ if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
nfs4_schedule_lease_recovery(pos);
status = nfs4_wait_clnt_recover(pos);
}
@@ -640,19 +645,15 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos->cl_cons_state != NFS_CS_READY)
continue;
- if (pos->rpc_ops != new->rpc_ops)
- continue;
-
- if (pos->cl_proto != new->cl_proto)
- continue;
-
- if (pos->cl_minorversion != new->cl_minorversion)
- continue;
-
if (!nfs4_match_clientids(pos, new))
continue;
- if (!nfs4_match_serverowners(pos, new))
+ /*
+ * Note that session trunking is just a special subcase of
+ * client id trunking. In either case, we want to fall back
+ * to using the existing nfs_client.
+ */
+ if (!nfs4_check_clientid_trunking(pos, new))
continue;
atomic_inc(&pos->cl_count);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 17f91a72840b..58258ad50d5f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1587,7 +1587,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
nfs_inode_find_state_and_recover(state->inode,
stateid);
nfs4_schedule_stateid_recovery(server, state);
- return 0;
+ return -EAGAIN;
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -2034,46 +2034,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
return ret;
}
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+ nfs_remove_bad_delegation(state->inode);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+ if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+ nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ return nfs4_open_expired(sp, state);
+}
+
#if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
- nfs4_stateid *stateid = &state->stateid;
+ nfs4_stateid stateid;
struct nfs_delegation *delegation;
- struct rpc_cred *cred = NULL;
- int status = -NFS4ERR_BAD_STATEID;
-
- /* If a state reset has been done, test_stateid is unneeded */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- return;
+ struct rpc_cred *cred;
+ int status;
/* Get the delegation credential for use by test/free_stateid */
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL &&
- nfs4_stateid_match(&delegation->stateid, stateid)) {
- cred = get_rpccred(delegation->cred);
- rcu_read_unlock();
- status = nfs41_test_stateid(server, stateid, cred);
- trace_nfs4_test_delegation_stateid(state, NULL, status);
- } else
+ if (delegation == NULL) {
rcu_read_unlock();
+ return;
+ }
+
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid, cred);
- nfs_remove_bad_delegation(state->inode);
-
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs41_free_stateid(server, &stateid, cred);
+ nfs_finish_clear_delegation_stateid(state);
}
- if (cred != NULL)
- put_rpccred(cred);
+ put_rpccred(cred);
}
/**
@@ -2117,7 +2131,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
int status;
- nfs41_clear_delegation_stateid(state);
+ nfs41_check_delegation_stateid(state);
status = nfs41_check_open_stateid(state);
if (status != NFS_OK)
status = nfs4_open_expired(sp, state);
@@ -2558,23 +2572,23 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
- /* Calculate the current open share mode */
- calldata->arg.fmode = 0;
- if (is_rdonly || is_rdwr)
- calldata->arg.fmode |= FMODE_READ;
- if (is_wronly || is_rdwr)
- calldata->arg.fmode |= FMODE_WRITE;
/* Calculate the change in open mode */
+ calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
- if (state->n_rdonly == 0) {
- call_close |= is_rdonly || is_rdwr;
- calldata->arg.fmode &= ~FMODE_READ;
- }
- if (state->n_wronly == 0) {
- call_close |= is_wronly || is_rdwr;
- calldata->arg.fmode &= ~FMODE_WRITE;
- }
- }
+ if (state->n_rdonly == 0)
+ call_close |= is_rdonly;
+ else if (is_rdonly)
+ calldata->arg.fmode |= FMODE_READ;
+ if (state->n_wronly == 0)
+ call_close |= is_wronly;
+ else if (is_wronly)
+ calldata->arg.fmode |= FMODE_WRITE;
+ } else if (is_rdwr)
+ calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+ if (calldata->arg.fmode == 0)
+ call_close |= is_rdwr;
+
if (!nfs4_valid_open_stateid(state))
call_close = 0;
spin_unlock(&state->owner->so_lock);
@@ -7242,7 +7256,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
int ret = 0;
if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
- return 0;
+ return -EAGAIN;
task = _nfs41_proc_sequence(clp, cred, false);
if (IS_ERR(task))
ret = PTR_ERR(task);
@@ -7575,6 +7589,9 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
dprintk("--> %s\n", __func__);
+ /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+ pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
if (!lgp->args.layout.pages) {
nfs4_layoutget_release(lgp);
@@ -7587,9 +7604,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
lgp->res.seq_res.sr_slot = NULL;
nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
- /* nfs4_layoutget_release calls pnfs_put_layout_hdr */
- pnfs_get_layout_hdr(NFS_I(inode)->layout);
-
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return ERR_CAST(task);
@@ -8255,7 +8269,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
- .recover_open = nfs4_open_expired,
+ .recover_open = nfs40_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
};
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
}
nfs_expire_all_delegations(clp);
} else {
+ int ret;
+
/* Queue an asynchronous RENEW. */
- ops->sched_state_renewal(clp, cred, renew_flags);
+ ret = ops->sched_state_renewal(clp, cred, renew_flags);
put_rpccred(cred);
- goto out_exp;
+ switch (ret) {
+ default:
+ goto out_exp;
+ case -EAGAIN:
+ case -ENOMEM:
+ break;
+ }
}
} else {
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 27f5f858502b..b4f177f1d405 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1732,7 +1732,8 @@ restart:
if (status < 0) {
set_bit(ops->owner_flag_bit, &sp->so_flags);
nfs4_put_state_owner(sp);
- return nfs4_recovery_handle_error(clp, status);
+ status = nfs4_recovery_handle_error(clp, status);
+ return (status != 0) ? status : -EAGAIN;
}
nfs4_put_state_owner(sp);
@@ -1741,7 +1742,7 @@ restart:
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
- return status;
+ return 0;
}
static int nfs4_check_lease(struct nfs_client *clp)
@@ -1788,7 +1789,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
break;
case -NFS4ERR_STALE_CLIENTID:
clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- nfs4_state_clear_reclaim_reboot(clp);
nfs4_state_start_reclaim_reboot(clp);
break;
case -NFS4ERR_CLID_INUSE:
@@ -2370,6 +2370,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
status = nfs4_check_lease(clp);
if (status < 0)
goto out_error;
+ continue;
}
if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2391,14 +2392,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim reboot";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->reboot_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
- continue;
- nfs4_state_end_reclaim_reboot(clp);
- if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
+ nfs4_state_end_reclaim_reboot(clp);
}
/* Now recover expired state... */
@@ -2406,9 +2404,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
section = "reclaim nograce";
status = nfs4_do_reclaim(clp,
clp->cl_mvops->nograce_recovery_ops);
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
- test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ if (status == -EAGAIN)
continue;
if (status < 0)
goto out_error;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2ffebf2081ce..27d7f2742592 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -113,7 +113,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
if (atomic_read(&c->io_count) == 0)
break;
ret = nfs_wait_bit_killable(&c->flags);
- } while (atomic_read(&c->io_count) != 0);
+ } while (atomic_read(&c->io_count) != 0 && !ret);
finish_wait(wq, &q.wait);
return ret;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cc8c5b32043c..f42bbe5fbc0a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -784,8 +784,12 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
{
if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
- dprintk("%s slot is busy\n", __func__);
- return false;
+ /* Race breaker */
+ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+ dprintk("%s slot is busy\n", __func__);
+ return false;
+ }
+ rpc_wake_up_queued_task(&clp->cl_cb_waitq, task);
}
return true;
}
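The hunk above adds a classic "race breaker": after the first test_and_set_bit() fails and the task queues itself on cl_cb_waitq, the bit is tested again so that a release which slipped in between the test and the queueing cannot leave the task sleeping forever. A compile-able userspace sketch of the same shape, with queue_me()/wake_queued() standing in for rpc_sleep_on()/rpc_wake_up_queued_task():

#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag slot_busy = ATOMIC_FLAG_INIT;

/* returns true once the caller owns the slot */
static bool try_get_slot(void (*queue_me)(void), void (*wake_queued)(void))
{
        if (atomic_flag_test_and_set(&slot_busy)) {
                queue_me();             /* park ourselves on the wait queue */
                /* Race breaker: the owner may have released the slot after
                 * our first test but before we were queued, in which case
                 * nobody would ever wake us.  Test again now that we are
                 * queued. */
                if (atomic_flag_test_and_set(&slot_busy))
                        return false;   /* really busy; stay queued */
                wake_queued();          /* we got the slot after all; unpark */
        }
        return true;
}

static void noop(void) { }

int main(void)
{
        return try_get_slot(noop, noop) ? 0 : 1;
}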
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f23a6ca37504..86f5d3e474bf 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1243,7 +1243,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
*/
if (argp->opcnt == resp->opcnt)
return false;
-
+ if (next->opnum == OP_ILLEGAL)
+ return false;
nextd = OPDESC(next);
/*
* Rest of 2.6.3.1.1: certain operations will return WRONGSEC
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..674a5d529e09 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -244,10 +244,11 @@ struct nfs4_dir_ctx {
};
static int
-nfsd4_build_namelist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct nfs4_dir_ctx *ctx = arg;
+ struct nfs4_dir_ctx *ctx =
+ container_of(__ctx, struct nfs4_dir_ctx, ctx);
struct name_list *entry;
if (namlen != HEXDIR_LEN - 1)
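This conversion (like the similar ones in fs/nfsd/vfs.c and fs/ocfs2 further down) switches a filldir-style callback from a void * cookie to a struct dir_context * embedded in the caller's private struct, recovered with container_of(). A self-contained userspace illustration of the pattern; the types below are simplified stand-ins, not the kernel definitions:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct dir_context {
        int (*actor)(struct dir_context *ctx, const char *name, int namlen);
};

struct nfs4_dir_ctx {
        struct dir_context ctx;   /* must be embedded in the private struct */
        int names_seen;
};

static int build_namelist(struct dir_context *__ctx, const char *name, int namlen)
{
        /* recover the enclosing private structure from the embedded member */
        struct nfs4_dir_ctx *ctx = container_of(__ctx, struct nfs4_dir_ctx, ctx);

        ctx->names_seen++;
        printf("entry: %.*s\n", namlen, name);
        return 0;
}

int main(void)
{
        struct nfs4_dir_ctx data = { .ctx = { .actor = build_namelist } };

        data.ctx.actor(&data.ctx, "example", 7);
        printf("seen %d entries\n", data.names_seen);
        return 0;
}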
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 34d2a1f2f400..daa53da1d286 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1209,15 +1209,14 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
return 0;
}
-static long long
+static int
compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
{
- long long res;
-
- res = o1->len - o2->len;
- if (res)
- return res;
- return (long long)memcmp(o1->data, o2->data, o1->len);
+ if (o1->len < o2->len)
+ return -1;
+ if (o1->len > o2->len)
+ return 1;
+ return memcmp(o1->data, o2->data, o1->len);
}
static int same_name(const char *n1, const char *n2)
@@ -1401,7 +1400,7 @@ add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
static struct nfs4_client *
find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
{
- long long cmp;
+ int cmp;
struct rb_node *node = root->rb_node;
struct nfs4_client *clp;
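A small userspace sketch of why the comparison is rewritten this way: the blob lengths are unsigned, so the old subtraction can never produce a negative value (it wraps instead), whereas the explicit -1/0/1 comparator always reports the correct ordering. The types and main() below are illustrative only:

#include <stdio.h>
#include <string.h>

struct blob {
        unsigned int len;
        const unsigned char *data;
};

static int compare_blob(const struct blob *o1, const struct blob *o2)
{
        if (o1->len < o2->len)
                return -1;
        if (o1->len > o2->len)
                return 1;
        return memcmp(o1->data, o2->data, o1->len);
}

int main(void)
{
        struct blob a = { 4, (const unsigned char *)"abcd" };
        struct blob b = { 8, (const unsigned char *)"abcdefgh" };

        /* subtracting unsigned lengths: 4 - 8 wraps to a huge positive value */
        printf("a.len - b.len = %u (never negative)\n", a.len - b.len);
        printf("compare_blob  = %d\n", compare_blob(&a, &b));
        return 0;
}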
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 86573350350e..dd1afa38f2ae 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1809,6 +1809,9 @@ static __be32 nfsd4_encode_components_esc(char sep, char *components,
}
else
end++;
+ if (found_esc)
+ end = next;
+
str = end;
}
*pp = p;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index f8f060ffbf4f..6040da8830ff 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -224,13 +224,6 @@ hash_refile(struct svc_cacherep *rp)
hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
}
-static inline bool
-nfsd_cache_entry_expired(struct svc_cacherep *rp)
-{
- return rp->c_state != RC_INPROG &&
- time_after(jiffies, rp->c_timestamp + RC_EXPIRE);
-}
-
/*
* Walk the LRU list and prune off entries that are older than RC_EXPIRE.
* Also prune the oldest ones when the total exceeds the max number of entries.
@@ -242,8 +235,14 @@ prune_cache_entries(void)
long freed = 0;
list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
- if (!nfsd_cache_entry_expired(rp) &&
- num_drc_entries <= max_drc_entries)
+ /*
+ * Don't free entries attached to calls that are still
+ * in-progress, but do keep scanning the list.
+ */
+ if (rp->c_state == RC_INPROG)
+ continue;
+ if (num_drc_entries <= max_drc_entries &&
+ time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(rp);
freed++;
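The new loop structure is worth spelling out: in-progress entries are skipped but scanning continues, and the walk stops at the first entry that is both unexpired and within the cache limit (the list is LRU-ordered, so everything after it is newer still). A simplified, self-contained model of that policy; the types and limits are illustrative:

#include <stdio.h>

enum { RC_INPROG, RC_DONE };

struct entry {
        int state;
        long timestamp;        /* when the entry was last used */
};

#define RC_EXPIRE   120
#define MAX_ENTRIES 3

static long prune(struct entry *lru, int n, long now, long *num_entries)
{
        long freed = 0;

        for (int i = 0; i < n; i++) {
                /* don't free entries attached to calls still in progress,
                 * but keep scanning the rest of the LRU list */
                if (lru[i].state == RC_INPROG)
                        continue;
                /* LRU order: once we hit an entry that is fresh and we are
                 * under the limit, everything after it is fresher, so stop */
                if (*num_entries <= MAX_ENTRIES &&
                    now < lru[i].timestamp + RC_EXPIRE)
                        break;
                lru[i].state = -1;     /* "freed" */
                freed++;
                (*num_entries)--;
        }
        return freed;
}

int main(void)
{
        struct entry lru[] = {
                { RC_DONE, 0 }, { RC_INPROG, 10 }, { RC_DONE, 500 },
        };
        long num_entries = 3;

        printf("freed %ld\n", prune(lru, 3, 600, &num_entries));
        return 0;
}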
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 479eb681c27c..f417fef17118 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -328,12 +328,15 @@ void nfsd_lockd_shutdown(void);
(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
- (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
#else
-#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#define NFSD4_2_SECURITY_ATTRS 0
#endif
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+ NFSD4_2_SECURITY_ATTRS)
+
static inline u32 nfsd_suppattrs0(u32 minorversion)
{
return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index eea5ad188984..3953a20db16d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1703,7 +1703,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
goto out_dput_new;
- host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
+ host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
@@ -1808,10 +1808,12 @@ struct readdir_data {
int full;
};
-static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
- struct readdir_data *buf = __buf;
+ struct readdir_data *buf =
+ container_of(ctx, struct readdir_data, ctx);
struct buffered_dirent *de = (void *)(buf->dirent + buf->used);
unsigned int reclen;
@@ -1831,7 +1833,7 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
return 0;
}
-static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
+static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
struct readdir_cd *cdp, loff_t *offsetp)
{
struct buffered_dirent *de;
@@ -1915,7 +1917,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
*/
__be32
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
- struct readdir_cd *cdp, filldir_t func)
+ struct readdir_cd *cdp, nfsd_filldir_t func)
{
__be32 err;
struct file *file;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fbe90bdb2214..ea760b17ecbd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -36,7 +36,7 @@
/*
* Callback function for readdir
*/
-typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
+typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
/* nfsd/vfs.c */
int nfsd_racache_init(int);
@@ -89,7 +89,7 @@ __be32 nfsd_rename(struct svc_rqst *,
__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
char *name, int len);
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
- loff_t *, struct readdir_cd *, filldir_t);
+ loff_t *, struct readdir_cd *, nfsd_filldir_t);
__be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct kstatfs *, int access);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b2e3ff347620..ecdbae19a766 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,6 +31,8 @@
#include "alloc.h"
#include "dat.h"
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
struct nilfs_btree_path *path;
@@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
return ret;
}
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+ unsigned long ino)
+{
+ int level, flags, nchildren;
+ int ret = 0;
+
+ level = nilfs_btree_node_get_level(node);
+ flags = nilfs_btree_node_get_flags(node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+
+ if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+ level > NILFS_BTREE_LEVEL_MAX ||
+ nchildren < 0 ||
+ nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+ pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
+ ino, level, flags, nchildren);
+ ret = 1;
+ }
+ return ret;
+}
+
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
int ret;
@@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
/* convert and insert */
dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
- nilfs_btree_init(btree);
+ __nilfs_btree_init(btree);
if (nreq != NULL) {
nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
@@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
.bop_gather_data = NULL,
};
-int nilfs_btree_init(struct nilfs_bmap *bmap)
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
{
bmap->b_ops = &nilfs_btree_ops;
bmap->b_nchildren_per_block =
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
- return 0;
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+ int ret = 0;
+
+ __nilfs_btree_init(bmap);
+
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+ bmap->b_inode->i_ino))
+ ret = -EIO;
+ return ret;
}
void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e350c562e0e..09480c53fd74 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
+#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/aio.h>
#include "nilfs.h"
@@ -48,6 +49,8 @@ struct nilfs_iget_args {
int for_gc;
};
+static int nilfs_iget_test(struct inode *inode, void *opaque);
+
void nilfs_inode_add_blocks(struct inode *inode, int n)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
@@ -219,10 +222,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
static int nilfs_set_page_dirty(struct page *page)
{
+ struct inode *inode = page->mapping->host;
int ret = __set_page_dirty_nobuffers(page);
if (page_has_buffers(page)) {
- struct inode *inode = page->mapping->host;
unsigned nr_dirty = 0;
struct buffer_head *bh, *head;
@@ -245,6 +248,10 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
+ } else if (ret) {
+ unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
@@ -342,6 +349,17 @@ const struct address_space_operations nilfs_aops = {
.is_partially_uptodate = block_is_partially_uptodate,
};
+static int nilfs_insert_inode_locked(struct inode *inode,
+ struct nilfs_root *root,
+ unsigned long ino)
+{
+ struct nilfs_iget_args args = {
+ .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ };
+
+ return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
+}
+
struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
@@ -377,7 +395,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
if (err < 0)
- goto failed_bmap;
+ goto failed_after_creation;
set_bit(NILFS_I_BMAP, &ii->i_state);
/* No lock is needed; iget() ensures it. */
@@ -393,21 +411,24 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
spin_lock(&nilfs->ns_next_gen_lock);
inode->i_generation = nilfs->ns_next_generation++;
spin_unlock(&nilfs->ns_next_gen_lock);
- insert_inode_hash(inode);
+ if (nilfs_insert_inode_locked(inode, root, ino) < 0) {
+ err = -EIO;
+ goto failed_after_creation;
+ }
err = nilfs_init_acl(inode, dir);
if (unlikely(err))
- goto failed_acl; /* never occur. When supporting
+ goto failed_after_creation; /* never occur. When supporting
nilfs_init_acl(), proper cancellation of
above jobs should be considered */
return inode;
- failed_acl:
- failed_bmap:
+ failed_after_creation:
clear_nlink(inode);
+ unlock_new_inode(inode);
iput(inode); /* raw_inode will be deleted through
- generic_delete_inode() */
+ nilfs_evict_inode() */
goto failed;
failed_ifile_create_inode:
@@ -455,8 +476,8 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- if (inode->i_nlink == 0 && inode->i_mode == 0)
- return -EINVAL; /* this inode is deleted */
+ if (inode->i_nlink == 0)
+ return -ESTALE; /* this inode is deleted */
inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
ii->i_flags = le32_to_cpu(raw_inode->i_flags);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 9de78f08989e..0f84b257932c 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -51,9 +51,11 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
int err = nilfs_add_link(dentry, inode);
if (!err) {
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -182,6 +184,7 @@ out:
out_fail:
drop_nlink(inode);
nilfs_mark_inode_dirty(inode);
+ unlock_new_inode(inode);
iput(inode);
goto out;
}
@@ -201,11 +204,15 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
inode_inc_link_count(inode);
ihold(inode);
- err = nilfs_add_nondir(dentry, inode);
- if (!err)
+ err = nilfs_add_link(dentry, inode);
+ if (!err) {
+ d_instantiate(dentry, inode);
err = nilfs_transaction_commit(dir->i_sb);
- else
+ } else {
+ inode_dec_link_count(inode);
+ iput(inode);
nilfs_transaction_abort(dir->i_sb);
+ }
return err;
}
@@ -243,6 +250,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
nilfs_mark_inode_dirty(inode);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
out:
if (!err)
err = nilfs_transaction_commit(dir->i_sb);
@@ -255,6 +263,7 @@ out_fail:
drop_nlink(inode);
drop_nlink(inode);
nilfs_mark_inode_dirty(inode);
+ unlock_new_inode(inode);
iput(inode);
out_dir:
drop_nlink(dir);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 9bc72dec3fa6..b02c202223a6 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
* @ti_save: Backup of journal_info field of task_struct
* @ti_flags: Flags
* @ti_count: Nest level
- * @ti_garbage: List of inode to be put when releasing semaphore
*/
struct nilfs_transaction_info {
u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
one of other filesystems has a bug. */
unsigned short ti_flags;
unsigned short ti_count;
- struct list_head ti_garbage;
};
/* ti_magic */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a191634abc..14538a865102 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
ti->ti_count = 0;
ti->ti_save = cur_ti;
ti->ti_magic = NILFS_TI_MAGIC;
- INIT_LIST_HEAD(&ti->ti_garbage);
current->journal_info = ti;
for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
- if (!list_empty(&ti->ti_garbage))
- nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
}
}
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+ struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+ sc_iput_work);
+ struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
struct nilfs_root *root)
{
@@ -1899,8 +1905,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
- struct nilfs_transaction_info *ti = current->journal_info;
struct nilfs_inode_info *ii, *n;
+ int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
+ int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1911,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
- list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+ list_del_init(&ii->i_dirty);
+ if (!ii->vfs_inode.i_nlink || during_mount) {
+ /*
+ * Defer calling iput() to avoid deadlocks if
+ * i_nlink == 0 or mount is not yet finished.
+ */
+ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+ defer_iput = true;
+ } else {
+ spin_unlock(&nilfs->ns_inode_lock);
+ iput(&ii->vfs_inode);
+ spin_lock(&nilfs->ns_inode_lock);
+ }
}
spin_unlock(&nilfs->ns_inode_lock);
+
+ if (defer_iput)
+ schedule_work(&sci->sc_iput_work);
}
/*
@@ -2580,6 +2602,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
INIT_LIST_HEAD(&sci->sc_segbufs);
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
+ INIT_LIST_HEAD(&sci->sc_iput_queue);
+ INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2606,6 +2630,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
nilfs_transaction_unlock(sci->sc_super);
+ flush_work(&sci->sc_iput_work);
+
} while (ret && retrycount-- > 0);
}
@@ -2630,6 +2656,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
+ if (flush_work(&sci->sc_iput_work))
+ flag = true;
+
if (flag || !nilfs_segctor_confirm(sci))
nilfs_segctor_write_out(sci);
@@ -2639,6 +2668,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
+ if (!list_empty(&sci->sc_iput_queue)) {
+ nilfs_warning(sci->sc_super, __func__,
+ "iput queue is not empty\n");
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+ }
+
WARN_ON(!list_empty(&sci->sc_segbufs));
WARN_ON(!list_empty(&sci->sc_write_logs));
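The pattern introduced here, queueing objects whose release could deadlock and letting a work item release them later outside the lock, can be sketched in userspace as follows. Names are illustrative; a plain mutex and array stand in for ns_inode_lock, sc_iput_queue and the workqueue:

#include <pthread.h>
#include <stdio.h>

#define MAX_OBJS 16

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int deferred[MAX_OBJS];
static int n_deferred;

static void release(int obj)            /* stands in for iput() */
{
        printf("released %d\n", obj);
}

/* runs later, from worker context, mirroring nilfs_iput_work_func() */
static void iput_work_func(void)
{
        pthread_mutex_lock(&list_lock);
        while (n_deferred > 0) {
                int obj = deferred[--n_deferred];

                /* drop the lock around the release, as the real code drops
                 * the spinlock around iput(), since releasing may need
                 * other locks */
                pthread_mutex_unlock(&list_lock);
                release(obj);
                pthread_mutex_lock(&list_lock);
        }
        pthread_mutex_unlock(&list_lock);
}

/* called with list_lock held */
static void drop_written_object(int obj, int teardown_may_deadlock)
{
        if (teardown_may_deadlock) {
                deferred[n_deferred++] = obj;   /* defer to the worker */
        } else {
                pthread_mutex_unlock(&list_lock);
                release(obj);                   /* safe: lock dropped */
                pthread_mutex_lock(&list_lock);
        }
}

int main(void)
{
        pthread_mutex_lock(&list_lock);
        drop_written_object(1, 0);
        drop_written_object(2, 1);
        pthread_mutex_unlock(&list_lock);

        iput_work_func();                       /* flush_work() equivalent */
        return 0;
}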
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 38a1d0013314..a48d6de1e02c 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
#include <linux/nilfs2_fs.h>
#include "nilfs.h"
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
* @sc_nblk_inc: Block count of current generation
* @sc_dirty_files: List of files to be written
* @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
* @sc_freesegs: array of segment numbers to be freed
* @sc_nfreesegs: number of segments on @sc_freesegs
* @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
struct list_head sc_dirty_files;
struct list_head sc_gc_inodes;
+ struct list_head sc_iput_queue;
+ struct work_struct sc_iput_work;
__u64 *sc_freesegs;
size_t sc_nfreesegs;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 287a22c04149..de6323eb0113 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -71,7 +71,7 @@ static int create_fd(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- client_fd = get_unused_fd();
+ client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
if (client_fd < 0)
return client_fd;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
{
struct {
struct file_handle handle;
- u8 pad[64];
+ u8 pad[MAX_HANDLE_SZ];
} f;
int size, ret, i;
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
size = f.handle.handle_bytes >> 2;
ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
- if ((ret == 255) || (ret == -ENOSPC)) {
+ if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
return 0;
}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 74825be65b7b..fbb9dfb7b1d2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -288,20 +288,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
spin_unlock(&inode->i_lock);
/* In case the dropping of a reference would nuke next_i. */
- if ((&next_i->i_sb_list != list) &&
- atomic_read(&next_i->i_count)) {
+ while (&next_i->i_sb_list != list) {
spin_lock(&next_i->i_lock);
- if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+ if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+ atomic_read(&next_i->i_count)) {
__iget(next_i);
need_iput = next_i;
+ spin_unlock(&next_i->i_lock);
+ break;
}
spin_unlock(&next_i->i_lock);
+ next_i = list_entry(next_i->i_sb_list.next,
+ struct inode, i_sb_list);
}
/*
- * We can safely drop inode_sb_list_lock here because we hold
- * references on both inode and next_i. Also no new inodes
- * will be added since the umount has begun.
+ * We can safely drop inode_sb_list_lock here because either
+ * we actually hold references on both inode and next_i or
+ * end of list. Also no new inodes will be added since the
+ * umount has begun.
*/
spin_unlock(&inode_sb_list_lock);
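The reworked loop pins the next inode that is neither being freed nor unreferenced before the list lock is dropped, walking forward until it finds one. A compact userspace model of that "pin the next live element" step, with stand-in types:

#include <stdbool.h>
#include <stdio.h>

struct item {
        int refcount;
        bool freeing;          /* I_FREEING | I_WILL_FREE analogue */
};

/* Scan forward from 'start', skipping items being freed or already
 * unreferenced, and pin (take a reference on) the first live one.
 * Returns its index, or n when the end of the list is reached. */
static int pin_next(struct item *list, int n, int start)
{
        for (int i = start; i < n; i++) {
                if (!list[i].freeing && list[i].refcount > 0) {
                        list[i].refcount++;     /* __iget() analogue */
                        return i;
                }
        }
        return n;
}

int main(void)
{
        struct item list[] = {
                { .refcount = 1 }, { .freeing = true }, { .refcount = 0 },
                { .refcount = 2 },
        };
        int next = pin_next(list, 4, 1);

        if (next < 4)
                printf("pinned index %d (refcount now %d)\n",
                       next, list[next].refcount);
        return 0;
}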
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a27e3fecefaf..250ed5b20c8f 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
if (page) {
set_page_dirty(page);
unlock_page(page);
- mark_page_accessed(page);
page_cache_release(page);
}
ntfs_debug("Done.");
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index db9bd8a31725..86ddab916b66 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
}
do {
unlock_page(pages[--do_pages]);
- mark_page_accessed(pages[do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
if (unlikely(status))
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index aeb44e879c51..bb6ee06118ca 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -899,7 +899,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
}
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
{
int i;
@@ -920,7 +920,11 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
page_cache_release(wc->w_target_page);
}
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
+}
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
}
@@ -2045,11 +2049,19 @@ out_write_size:
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Unlock pages before dealloc since it needs to acquire the j_trans_barrier
+ * lock, or it will cause a deadlock, since the journal commit threads hold
+ * this lock and will ask for the page lock when flushing the data.
+ * Put it here to preserve the unlock order.
+ */
+ ocfs2_unlock_pages(wc);
+
ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
- ocfs2_free_write_ctxt(wc);
+ brelse(wc->w_di_bh);
+ kfree(wc);
return copied;
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 91a7e85ac8fd..478e14d7be54 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2073,10 +2073,12 @@ struct ocfs2_empty_dir_priv {
unsigned seen_other;
unsigned dx_dir;
};
-static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
- loff_t pos, u64 ino, unsigned type)
+static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name,
+ int name_len, loff_t pos, u64 ino,
+ unsigned type)
{
- struct ocfs2_empty_dir_priv *p = priv;
+ struct ocfs2_empty_dir_priv *p =
+ container_of(ctx, struct ocfs2_empty_dir_priv, ctx);
/*
* Check the positions of "." and ".." records to be sure
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index af3f7aa73e13..1be3398c96f6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -650,12 +650,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
clear_bit(bit, res->refmap);
}
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- assert_spin_locked(&res->spinlock);
-
res->inflight_locks++;
mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -663,6 +660,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
__builtin_return_address(0));
}
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ __dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -852,10 +856,8 @@ lookup:
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
- /* Grab inflight ref to pin the resource */
- spin_lock(&res->spinlock);
- dlm_lockres_grab_inflight_ref(dlm, res);
- spin_unlock(&res->spinlock);
+ /* since this lockres is new it doesn't require the spinlock */
+ __dlm_lockres_grab_inflight_ref(dlm, res);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 51632c40e896..7fe30f655aa5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2391,10 +2391,14 @@ out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+ if (unlikely(written <= 0))
+ goto no_sync;
+
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
((file->f_flags & O_DIRECT) && !direct_io)) {
- ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
- *ppos + count - 1);
+ ret = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
if (ret < 0)
written = ret;
@@ -2407,10 +2411,12 @@ out_dio:
}
if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, *ppos,
- *ppos + count - 1);
+ ret = filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
+no_sync:
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3d..a1b7dca075dd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1981,10 +1981,12 @@ struct ocfs2_orphan_filldir_priv {
struct ocfs2_super *osb;
};
-static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
- loff_t pos, u64 ino, unsigned type)
+static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
+ int name_len, loff_t pos, u64 ino,
+ unsigned type)
{
- struct ocfs2_orphan_filldir_priv *p = priv;
+ struct ocfs2_orphan_filldir_priv *p =
+ container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx);
struct inode *iter;
if (name_len == 1 && !strncmp(".", name, 1))
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index feed025fe064..b2427623dd6c 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -94,6 +94,14 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
struct inode *inode,
const char *symname);
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+ struct buffer_head **bh1,
+ struct inode *inode1,
+ struct buffer_head **bh2,
+ struct inode *inode2,
+ int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
@@ -656,8 +664,10 @@ static int ocfs2_link(struct dentry *old_dentry,
{
handle_t *handle;
struct inode *inode = old_dentry->d_inode;
+ struct inode *old_dir = old_dentry->d_parent->d_inode;
int err;
struct buffer_head *fe_bh = NULL;
+ struct buffer_head *old_dir_bh = NULL;
struct buffer_head *parent_fe_bh = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -674,19 +684,33 @@ static int ocfs2_link(struct dentry *old_dentry,
dquot_initialize(dir);
- err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+ err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+ &parent_fe_bh, dir, 0);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
return err;
}
+ /* make sure both dirs have bhs;
+ * get an extra ref on old_dir_bh if old == new */
+ if (!parent_fe_bh) {
+ if (old_dir_bh) {
+ parent_fe_bh = old_dir_bh;
+ get_bh(parent_fe_bh);
+ } else {
+ mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+ err = -EIO;
+ goto out;
+ }
+ }
+
if (!dir->i_nlink) {
err = -ENOENT;
goto out;
}
- err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
old_dentry->d_name.len, &old_de_ino);
if (err) {
err = -ENOENT;
@@ -779,10 +803,11 @@ out_unlock_inode:
ocfs2_inode_unlock(inode, 1);
out:
- ocfs2_inode_unlock(dir, 1);
+ ocfs2_double_unlock(old_dir, dir);
brelse(fe_bh);
brelse(parent_fe_bh);
+ brelse(old_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
@@ -991,14 +1016,15 @@ leave:
}
/*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
* if they have the same id, then the 1st one is the only one locked.
*/
static int ocfs2_double_lock(struct ocfs2_super *osb,
struct buffer_head **bh1,
struct inode *inode1,
struct buffer_head **bh2,
- struct inode *inode2)
+ struct inode *inode2,
+ int rename)
{
int status;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
@@ -1028,7 +1054,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id2 */
status = ocfs2_inode_lock_nested(inode2, bh2, 1,
- OI_LS_RENAME1);
+ rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1037,7 +1063,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
- status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+ rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
if (status < 0) {
/*
* An error return must mean that no cluster locks
@@ -1137,7 +1164,7 @@ static int ocfs2_rename(struct inode *old_dir,
/* if old and new are the same, this'll just do one lock. */
status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
- &new_dir_bh, new_dir);
+ &new_dir_bh, new_dir, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
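ocfs2_double_lock(), now shared by rename and link, exists to take two inode locks in a single global order so that two tasks locking the same pair cannot deadlock, and to take only one lock when the two inodes are identical. A minimal userspace sketch of that idea, ordering by address here rather than by a stable on-disk key as a filesystem would:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct obj {
        pthread_mutex_t lock;
};

static void double_lock(struct obj *a, struct obj *b)
{
        if (a == b) {
                pthread_mutex_lock(&a->lock);   /* same object: one lock only */
                return;
        }
        if ((uintptr_t)a > (uintptr_t)b) {      /* canonical order first */
                struct obj *tmp = a;
                a = b;
                b = tmp;
        }
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct obj *a, struct obj *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct obj x = { PTHREAD_MUTEX_INITIALIZER };
        struct obj y = { PTHREAD_MUTEX_INITIALIZER };

        double_lock(&x, &y);
        printf("both locked\n");
        double_unlock(&x, &y);
        return 0;
}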
diff --git a/fs/open.c b/fs/open.c
index 2ed7325f713e..c92c6ef8bf6d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -822,8 +822,7 @@ struct file *dentry_open(const struct path *path, int flags,
f = get_empty_filp();
if (!IS_ERR(f)) {
f->f_flags = flags;
- f->f_path = *path;
- error = do_dentry_open(f, NULL, cred);
+ error = vfs_open(path, f, cred);
if (!error) {
/* from now on we need fput() to dispose of f */
error = open_check_o_direct(f);
@@ -840,6 +839,26 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @filp: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *filp,
+ const struct cred *cred)
+{
+ struct inode *inode = path->dentry->d_inode;
+
+ if (inode->i_op->dentry_open)
+ return inode->i_op->dentry_open(path->dentry, filp, cred);
+ else {
+ filp->f_path = *path;
+ return do_dentry_open(filp, NULL, cred);
+ }
+}
+EXPORT_SYMBOL(vfs_open);
+
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
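The new vfs_open() is a thin dispatcher: if the inode provides a dentry_open method it is used, otherwise the default do_dentry_open() path runs. A userspace sketch of the same dispatch-with-fallback shape, with stand-in types rather than the kernel structures; a stacking filesystem such as overlayfs can install the hook to redirect opens to the underlying file:

#include <stdio.h>

struct file;
struct path;

struct inode_ops {
        int (*dentry_open)(const struct path *path, struct file *filp);
};

struct inode {
        const struct inode_ops *i_op;
};

static int default_open(const struct path *path, struct file *filp)
{
        printf("default open path\n");
        return 0;
}

static int vfs_open_like(const struct inode *inode, const struct path *path,
                         struct file *filp)
{
        /* prefer the per-inode hook when the filesystem provides one */
        if (inode->i_op->dentry_open)
                return inode->i_op->dentry_open(path, filp);
        return default_open(path, filp);
}

static int overlay_open(const struct path *path, struct file *filp)
{
        printf("filesystem-provided open hook\n");
        return 0;
}

int main(void)
{
        struct inode plain = { &(struct inode_ops){ 0 } };
        struct inode layered = { &(struct inode_ops){ overlay_open } };

        vfs_open_like(&plain, 0, 0);
        vfs_open_like(&layered, 0, 0);
        return 0;
}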
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 000000000000..34355818a2e0
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
+config OVERLAY_FS
+ tristate "Overlay filesystem support"
+ help
+ An overlay filesystem combines two filesystems - an 'upper' filesystem
+ and a 'lower' filesystem. When a name exists in both filesystems, the
+ object in the 'upper' filesystem is visible while the object in the
+ 'lower' filesystem is either hidden or, in the case of directories,
+ merged with the 'upper' object.
+
+ For more information see Documentation/filesystems/overlayfs.txt
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 000000000000..900daed3e91d
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the overlay filesystem.
+#
+
+obj-$(CONFIG_OVERLAY_FS) += overlay.o
+
+overlay-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000000..24f640441bd9
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,413 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/splice.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include "overlayfs.h"
+
+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+
+int ovl_copy_xattr(struct dentry *old, struct dentry *new)
+{
+ ssize_t list_size, size;
+ char *buf, *name, *value;
+ int error;
+
+ if (!old->d_inode->i_op->getxattr ||
+ !new->d_inode->i_op->getxattr)
+ return 0;
+
+ list_size = vfs_listxattr(old, NULL, 0);
+ if (list_size <= 0) {
+ if (list_size == -EOPNOTSUPP)
+ return 0;
+ return list_size;
+ }
+
+ buf = kzalloc(list_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ error = -ENOMEM;
+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
+ if (!value)
+ goto out;
+
+ list_size = vfs_listxattr(old, buf, list_size);
+ if (list_size <= 0) {
+ error = list_size;
+ goto out_free_value;
+ }
+
+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
+ if (size <= 0) {
+ error = size;
+ goto out_free_value;
+ }
+ error = vfs_setxattr(new, name, value, size, 0);
+ if (error)
+ goto out_free_value;
+ }
+
+out_free_value:
+ kfree(value);
+out:
+ kfree(buf);
+ return error;
+}
+
+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
+{
+ struct file *old_file;
+ struct file *new_file;
+ loff_t old_pos = 0;
+ loff_t new_pos = 0;
+ int error = 0;
+
+ if (len == 0)
+ return 0;
+
+ old_file = ovl_path_open(old, O_RDONLY);
+ if (IS_ERR(old_file))
+ return PTR_ERR(old_file);
+
+ new_file = ovl_path_open(new, O_WRONLY);
+ if (IS_ERR(new_file)) {
+ error = PTR_ERR(new_file);
+ goto out_fput;
+ }
+
+ /* FIXME: copy up sparse files efficiently */
+ while (len) {
+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
+ long bytes;
+
+ if (len < this_len)
+ this_len = len;
+
+ if (signal_pending_state(TASK_KILLABLE, current)) {
+ error = -EINTR;
+ break;
+ }
+
+ bytes = do_splice_direct(old_file, &old_pos,
+ new_file, &new_pos,
+ this_len, SPLICE_F_MOVE);
+ if (bytes <= 0) {
+ error = bytes;
+ break;
+ }
+ WARN_ON(old_pos != new_pos);
+
+ len -= bytes;
+ }
+
+ fput(new_file);
+out_fput:
+ fput(old_file);
+ return error;
+}
+
+static char *ovl_read_symlink(struct dentry *realdentry)
+{
+ int res;
+ char *buf;
+ struct inode *inode = realdentry->d_inode;
+ mm_segment_t old_fs;
+
+ res = -EINVAL;
+ if (!inode->i_op->readlink)
+ goto err;
+
+ res = -ENOMEM;
+ buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = inode->i_op->readlink(realdentry,
+ (char __user *)buf, PAGE_SIZE - 1);
+ set_fs(old_fs);
+ if (res < 0) {
+ free_page((unsigned long) buf);
+ goto err;
+ }
+ buf[res] = '\0';
+
+ return buf;
+
+err:
+ return ERR_PTR(res);
+}
+
+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
+{
+ struct iattr attr = {
+ .ia_valid =
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ .ia_atime = stat->atime,
+ .ia_mtime = stat->mtime,
+ };
+
+ return notify_change(upperdentry, &attr, NULL);
+}
+
+int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
+{
+ int err = 0;
+
+ if (!S_ISLNK(stat->mode)) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat->mode,
+ };
+ err = notify_change(upperdentry, &attr, NULL);
+ }
+ if (!err) {
+ struct iattr attr = {
+ .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = stat->uid,
+ .ia_gid = stat->gid,
+ };
+ err = notify_change(upperdentry, &attr, NULL);
+ }
+ if (!err)
+ ovl_set_timestamps(upperdentry, stat);
+
+ return err;
+}
+
+static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
+ struct dentry *dentry, struct path *lowerpath,
+ struct kstat *stat, struct iattr *attr,
+ const char *link)
+{
+ struct inode *wdir = workdir->d_inode;
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry = NULL;
+ struct dentry *upper = NULL;
+ umode_t mode = stat->mode;
+ int err;
+
+ newdentry = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out;
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out1;
+
+ /* Can't properly set mode on creation because of the umask */
+ stat->mode &= S_IFMT;
+ err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
+ stat->mode = mode;
+ if (err)
+ goto out2;
+
+ if (S_ISREG(stat->mode)) {
+ struct path upperpath;
+ ovl_path_upper(dentry, &upperpath);
+ BUG_ON(upperpath.dentry != NULL);
+ upperpath.dentry = newdentry;
+
+ err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
+ if (err)
+ goto out_cleanup;
+ }
+
+ err = ovl_copy_xattr(lowerpath->dentry, newdentry);
+ if (err)
+ goto out_cleanup;
+
+ mutex_lock(&newdentry->d_inode->i_mutex);
+ err = ovl_set_attr(newdentry, stat);
+ if (!err && attr)
+ err = notify_change(newdentry, attr, NULL);
+ mutex_unlock(&newdentry->d_inode->i_mutex);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+
+ ovl_dentry_update(dentry, newdentry);
+ newdentry = NULL;
+
+ /*
+ * Non-directories become opaque when copied up.
+ */
+ if (!S_ISDIR(stat->mode))
+ ovl_dentry_set_opaque(dentry, true);
+out2:
+ dput(upper);
+out1:
+ dput(newdentry);
+out:
+ return err;
+
+out_cleanup:
+ ovl_cleanup(wdir, newdentry);
+ goto out;
+}
+
+/*
+ * Copy up a single dentry
+ *
+ * Directory renames are only allowed on "pure upper" (already created on
+ * the upper filesystem, never copied up). Directories which are on lower or
+ * are merged may not be renamed. For these -EXDEV is returned and
+ * userspace has to deal with it. This means that, when copying up a
+ * directory, we can rely on it and its ancestors being stable.
+ *
+ * Non-directory renames start with copy up of source if necessary. The
+ * actual rename will only proceed once the copy up was successful. Copy
+ * up uses upper parent i_mutex for exclusion. Since rename can change
+ * d_parent it is possible that the copy up will lock the old parent. At
+ * that point the file will have already been copied up anyway.
+ */
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat,
+ struct iattr *attr)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ int err;
+ struct kstat pstat;
+ struct path parentpath;
+ struct dentry *upperdir;
+ struct dentry *upperdentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+ char *link = NULL;
+
+ ovl_path_upper(parent, &parentpath);
+ upperdir = parentpath.dentry;
+
+ err = vfs_getattr(&parentpath, &pstat);
+ if (err)
+ return err;
+
+ if (S_ISLNK(stat->mode)) {
+ link = ovl_read_symlink(lowerpath->dentry);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_free_link;
+
+ override_cred->fsuid = stat->uid;
+ override_cred->fsgid = stat->gid;
+ /*
+ * CAP_SYS_ADMIN for copying up extended attributes
+ * CAP_DAC_OVERRIDE for create
+ * CAP_FOWNER for chmod, timestamp update
+ * CAP_FSETID for chmod
+ * CAP_CHOWN for chown
+ * CAP_MKNOD for mknod
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
+ old_cred = override_creds(override_cred);
+
+ err = -EIO;
+ if (lock_rename(workdir, upperdir) != NULL) {
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ goto out_unlock;
+ }
+ upperdentry = ovl_dentry_upper(dentry);
+ if (upperdentry) {
+ unlock_rename(workdir, upperdir);
+ err = 0;
+ /* Raced with another copy-up? Do the setattr here */
+ if (attr) {
+ mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = notify_change(upperdentry, attr, NULL);
+ mutex_unlock(&upperdentry->d_inode->i_mutex);
+ }
+ goto out_put_cred;
+ }
+
+ err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
+ stat, attr, link);
+ if (!err) {
+ /* Restore timestamps on parent (best effort) */
+ ovl_set_timestamps(upperdir, &pstat);
+ }
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_put_cred:
+ revert_creds(old_cred);
+ put_cred(override_cred);
+
+out_free_link:
+ if (link)
+ free_page((unsigned long) link);
+
+ return err;
+}
+
+int ovl_copy_up(struct dentry *dentry)
+{
+ int err;
+
+ err = 0;
+ while (!err) {
+ struct dentry *next;
+ struct dentry *parent;
+ struct path lowerpath;
+ struct kstat stat;
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (OVL_TYPE_UPPER(type))
+ break;
+
+ next = dget(dentry);
+ /* find the topmost dentry not yet copied up */
+ for (;;) {
+ parent = dget_parent(next);
+
+ type = ovl_path_type(parent);
+ if (OVL_TYPE_UPPER(type))
+ break;
+
+ dput(next);
+ next = parent;
+ }
+
+ ovl_path_lower(next, &lowerpath);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (!err)
+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+
+ dput(parent);
+ dput(next);
+ }
+
+ return err;
+}
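ovl_copy_up_data() above copies the lower file's contents into the freshly created upper file in fixed-size chunks via do_splice_direct(). The same loop shape in plain POSIX C, as a rough userspace sketch (read()/write() instead of splice, no sparse-file handling or signal checks):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define CHUNK (1 << 20)        /* mirrors OVL_COPY_UP_CHUNK_SIZE */

static int copy_up_data(const char *lower, const char *upper)
{
        static char buf[CHUNK];
        ssize_t n;
        int err = 0;

        int in = open(lower, O_RDONLY);
        if (in < 0)
                return -1;
        int out = open(upper, O_WRONLY | O_CREAT | O_TRUNC, 0600);
        if (out < 0) {
                close(in);
                return -1;
        }

        while ((n = read(in, buf, sizeof(buf))) > 0) {
                if (write(out, buf, n) != n) {  /* short writes treated as failure */
                        err = -1;
                        break;
                }
        }
        if (n < 0)
                err = -1;

        close(in);
        close(out);
        return err;
}

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <lower-file> <upper-file>\n", argv[0]);
                return 1;
        }
        return copy_up_data(argv[1], argv[2]) ? 1 : 0;
}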
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 000000000000..0dc4c33a0a1b
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,928 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
+{
+ int err;
+
+ dget(wdentry);
+ if (S_ISDIR(wdentry->d_inode->i_mode))
+ err = ovl_do_rmdir(wdir, wdentry);
+ else
+ err = ovl_do_unlink(wdir, wdentry);
+ dput(wdentry);
+
+ if (err) {
+ pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
+ wdentry, err);
+ }
+}
+
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
+{
+ struct dentry *temp;
+ char name[20];
+
+ snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
+
+ temp = lookup_one_len(name, workdir, strlen(name));
+ if (!IS_ERR(temp) && temp->d_inode) {
+ pr_err("overlayfs: workdir/%s already exists\n", name);
+ dput(temp);
+ temp = ERR_PTR(-EIO);
+ }
+
+ return temp;
+}
+
+/* caller holds i_mutex on workdir */
+static struct dentry *ovl_whiteout(struct dentry *workdir,
+ struct dentry *dentry)
+{
+ int err;
+ struct dentry *whiteout;
+ struct inode *wdir = workdir->d_inode;
+
+ whiteout = ovl_lookup_temp(workdir, dentry);
+ if (IS_ERR(whiteout))
+ return whiteout;
+
+ err = ovl_do_whiteout(wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
+
+ return whiteout;
+}
+
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink, bool debug)
+{
+ int err;
+
+ if (newdentry->d_inode)
+ return -ESTALE;
+
+ if (hardlink) {
+ err = ovl_do_link(hardlink, dir, newdentry, debug);
+ } else {
+ switch (stat->mode & S_IFMT) {
+ case S_IFREG:
+ err = ovl_do_create(dir, newdentry, stat->mode, debug);
+ break;
+
+ case S_IFDIR:
+ err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
+ break;
+
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ err = ovl_do_mknod(dir, newdentry,
+ stat->mode, stat->rdev, debug);
+ break;
+
+ case S_IFLNK:
+ err = ovl_do_symlink(dir, newdentry, link, debug);
+ break;
+
+ default:
+ err = -EPERM;
+ }
+ }
+ if (!err && WARN_ON(!newdentry->d_inode)) {
+ /*
+ * Not quite sure if non-instantiated dentry is legal or not.
+ * VFS doesn't seem to care so check and warn here.
+ */
+ err = -ENOENT;
+ }
+ return err;
+}
+
+static int ovl_set_opaque(struct dentry *upperdentry)
+{
+ return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
+}
+
+static void ovl_remove_opaque(struct dentry *upperdentry)
+{
+ int err;
+
+ err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
+ if (err) {
+ pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
+ upperdentry->d_name.name, err);
+ }
+}
+
+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ int err;
+ enum ovl_path_type type;
+ struct path realpath;
+
+ type = ovl_path_real(dentry, &realpath);
+ err = vfs_getattr(&realpath, stat);
+ if (err)
+ return err;
+
+ stat->dev = dentry->d_sb->s_dev;
+ stat->ino = dentry->d_inode->i_ino;
+
+ /*
+ * It's probably not worth it to count subdirs to get the
+ * correct link count. nlink=1 seems to pacify 'find' and
+ * other utilities.
+ */
+ if (OVL_TYPE_MERGE(type))
+ stat->nlink = 1;
+
+ return 0;
+}
+
+static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
+{
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *newdentry;
+ int err;
+
+ mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+ err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
+ if (err)
+ goto out_dput;
+
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ ovl_copyattr(newdentry->d_inode, inode);
+ d_instantiate(dentry, inode);
+ newdentry = NULL;
+out_dput:
+ dput(newdentry);
+out_unlock:
+ mutex_unlock(&udir->i_mutex);
+ return err;
+}
+
+static int ovl_lock_rename_workdir(struct dentry *workdir,
+ struct dentry *upperdir)
+{
+ /* Workdir should not be the same as upperdir */
+ if (workdir == upperdir)
+ goto err;
+
+ /* Workdir should not be subdir of upperdir and vice versa */
+ if (lock_rename(workdir, upperdir) != NULL)
+ goto err_unlock;
+
+ return 0;
+
+err_unlock:
+ unlock_rename(workdir, upperdir);
+err:
+ pr_err("overlayfs: failed to lock workdir+upperdir\n");
+ return -EIO;
+}
+
+static struct dentry *ovl_clear_empty(struct dentry *dentry,
+ struct list_head *list)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct path upperpath;
+ struct dentry *upper;
+ struct dentry *opaquedir;
+ struct kstat stat;
+ int err;
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ ovl_path_upper(dentry, &upperpath);
+ err = vfs_getattr(&upperpath, &stat);
+ if (err)
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (!S_ISDIR(stat.mode))
+ goto out_unlock;
+ upper = upperpath.dentry;
+ if (upper->d_parent->d_inode != udir)
+ goto out_unlock;
+
+ opaquedir = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out_unlock;
+
+ err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
+ if (err)
+ goto out_dput;
+
+ err = ovl_copy_xattr(upper, opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_set_opaque(opaquedir);
+ if (err)
+ goto out_cleanup;
+
+ mutex_lock(&opaquedir->d_inode->i_mutex);
+ err = ovl_set_attr(opaquedir, &stat);
+ mutex_unlock(&opaquedir->d_inode->i_mutex);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup_whiteouts(upper, list);
+ ovl_cleanup(wdir, upper);
+ unlock_rename(workdir, upperdir);
+
+ /* dentry's upper doesn't match now, get rid of it */
+ d_drop(dentry);
+
+ return opaquedir;
+
+out_cleanup:
+ ovl_cleanup(wdir, opaquedir);
+out_dput:
+ dput(opaquedir);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return ERR_PTR(err);
+}
+
+static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
+{
+ int err;
+ struct dentry *ret = NULL;
+ LIST_HEAD(list);
+
+ err = ovl_check_empty_dir(dentry, &list);
+ if (err)
+ ret = ERR_PTR(err);
+ else {
+ /*
+ * If no upperdentry then skip clearing whiteouts.
+ *
+ * Can race with copy-up, since we don't hold the upperdir
+ * mutex. Doesn't matter, since copy-up can't create a
+ * non-empty directory from an empty one.
+ */
+ if (ovl_dentry_upper(dentry))
+ ret = ovl_clear_empty(dentry, &list);
+ }
+
+ ovl_cache_free(&list);
+
+ return ret;
+}
+
+static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *upper;
+ struct dentry *newdentry;
+ int err;
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out;
+
+ newdentry = ovl_lookup_temp(workdir, dentry);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_dput;
+
+ err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
+ if (err)
+ goto out_dput2;
+
+ if (S_ISDIR(stat->mode)) {
+ err = ovl_set_opaque(newdentry);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_do_rename(wdir, newdentry, udir, upper,
+ RENAME_EXCHANGE);
+ if (err)
+ goto out_cleanup;
+
+ ovl_cleanup(wdir, upper);
+ } else {
+ err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
+ if (err)
+ goto out_cleanup;
+ }
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ ovl_copyattr(newdentry->d_inode, inode);
+ d_instantiate(dentry, inode);
+ newdentry = NULL;
+out_dput2:
+ dput(upper);
+out_dput:
+ dput(newdentry);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out:
+ return err;
+
+out_cleanup:
+ ovl_cleanup(wdir, newdentry);
+ goto out_dput2;
+}
+
+static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link, struct dentry *hardlink)
+{
+ int err;
+ struct inode *inode;
+ struct kstat stat = {
+ .mode = mode,
+ .rdev = rdev,
+ };
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
+ if (!inode)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_iput;
+
+ if (!ovl_dentry_is_opaque(dentry)) {
+ err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
+ } else {
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_iput;
+
+ /*
+ * CAP_SYS_ADMIN for setting opaque xattr
+ * CAP_DAC_OVERRIDE for create in workdir, rename
+ * CAP_FOWNER for removing whiteout from sticky dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ old_cred = override_creds(override_cred);
+
+ err = ovl_create_over_whiteout(dentry, inode, &stat, link,
+ hardlink);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+
+ if (!err)
+ inode = NULL;
+out_iput:
+ iput(inode);
+out:
+ return err;
+}
+
+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
+ const char *link)
+{
+ int err;
+
+ err = ovl_want_write(dentry);
+ if (!err) {
+ err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
+ ovl_drop_write(dentry);
+ }
+
+ return err;
+}
+
+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
+}
+
+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+}
+
+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t rdev)
+{
+ /* Don't allow creation of "whiteout" on overlay */
+ if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
+ return -EPERM;
+
+ return ovl_create_object(dentry, mode, rdev, NULL);
+}
+
+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
+ const char *link)
+{
+ return ovl_create_object(dentry, S_IFLNK, 0, link);
+}
+
+static int ovl_link(struct dentry *old, struct inode *newdir,
+ struct dentry *new)
+{
+ int err;
+ struct dentry *upper;
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ upper = ovl_dentry_upper(old);
+ err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ return err;
+}
+
+static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *wdir = workdir->d_inode;
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *udir = upperdir->d_inode;
+ struct dentry *whiteout;
+ struct dentry *upper;
+ struct dentry *opaquedir = NULL;
+ int err;
+
+ if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
+ opaquedir = ovl_check_empty_and_clear(dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ }
+
+ err = ovl_lock_rename_workdir(workdir, upperdir);
+ if (err)
+ goto out_dput;
+
+ whiteout = ovl_whiteout(workdir, dentry);
+ err = PTR_ERR(whiteout);
+ if (IS_ERR(whiteout))
+ goto out_unlock;
+
+ upper = ovl_dentry_upper(dentry);
+ if (!upper) {
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto kill_whiteout;
+
+ err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
+ dput(upper);
+ if (err)
+ goto kill_whiteout;
+ } else {
+ int flags = 0;
+
+ if (opaquedir)
+ upper = opaquedir;
+ err = -ESTALE;
+ if (upper->d_parent != upperdir)
+ goto kill_whiteout;
+
+ if (is_dir)
+ flags |= RENAME_EXCHANGE;
+
+ err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
+ if (err)
+ goto kill_whiteout;
+
+ if (is_dir)
+ ovl_cleanup(wdir, upper);
+ }
+ ovl_dentry_version_inc(dentry->d_parent);
+out_d_drop:
+ d_drop(dentry);
+ dput(whiteout);
+out_unlock:
+ unlock_rename(workdir, upperdir);
+out_dput:
+ dput(opaquedir);
+out:
+ return err;
+
+kill_whiteout:
+ ovl_cleanup(wdir, whiteout);
+ goto out_d_drop;
+}
+
+static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
+{
+ struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct inode *dir = upperdir->d_inode;
+ struct dentry *upper = ovl_dentry_upper(dentry);
+ int err;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ err = -ESTALE;
+ if (upper->d_parent == upperdir) {
+ /* Don't let d_delete() think it can reset d_inode */
+ dget(upper);
+ if (is_dir)
+ err = vfs_rmdir(dir, upper);
+ else
+ err = vfs_unlink(dir, upper, NULL);
+ dput(upper);
+ ovl_dentry_version_inc(dentry->d_parent);
+ }
+
+ /*
+ * Keeping this dentry hashed would mean having to release
+ * upperpath/lowerpath, which could only be done if we are the
+ * sole user of this dentry. Too tricky... Just unhash for
+ * now.
+ */
+ d_drop(dentry);
+ mutex_unlock(&dir->i_mutex);
+
+ return err;
+}
+
+static inline int ovl_check_sticky(struct dentry *dentry)
+{
+ struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
+ struct inode *inode = ovl_dentry_real(dentry)->d_inode;
+
+ if (check_sticky(dir, inode))
+ return -EPERM;
+
+ return 0;
+}
+
+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
+{
+ enum ovl_path_type type;
+ int err;
+
+ err = ovl_check_sticky(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(dentry->d_parent);
+ if (err)
+ goto out_drop_write;
+
+ type = ovl_path_type(dentry);
+ if (OVL_TYPE_PURE_UPPER(type)) {
+ err = ovl_remove_upper(dentry, is_dir);
+ } else {
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_drop_write;
+
+ /*
+ * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+ * CAP_DAC_OVERRIDE for create in workdir, rename
+ * CAP_FOWNER for removing whiteout from sticky dir
+ * CAP_FSETID for chmod of opaque dir
+ * CAP_CHOWN for chown of opaque dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ old_cred = override_creds(override_cred);
+
+ err = ovl_remove_and_whiteout(dentry, is_dir);
+
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, false);
+}
+
+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ return ovl_do_remove(dentry, true);
+}
+
+static int ovl_rename2(struct inode *olddir, struct dentry *old,
+ struct inode *newdir, struct dentry *new,
+ unsigned int flags)
+{
+ int err;
+ enum ovl_path_type old_type;
+ enum ovl_path_type new_type;
+ struct dentry *old_upperdir;
+ struct dentry *new_upperdir;
+ struct dentry *olddentry;
+ struct dentry *newdentry;
+ struct dentry *trap;
+ bool old_opaque;
+ bool new_opaque;
+ bool new_create = false;
+ bool cleanup_whiteout = false;
+ bool overwrite = !(flags & RENAME_EXCHANGE);
+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
+ bool new_is_dir = false;
+ struct dentry *opaquedir = NULL;
+ const struct cred *old_cred = NULL;
+ struct cred *override_cred = NULL;
+
+ err = -EINVAL;
+ if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+ goto out;
+
+ flags &= ~RENAME_NOREPLACE;
+
+ err = ovl_check_sticky(old);
+ if (err)
+ goto out;
+
+ /* Don't copy up directory trees */
+ old_type = ovl_path_type(old);
+ err = -EXDEV;
+ if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
+ goto out;
+
+ if (new->d_inode) {
+ err = ovl_check_sticky(new);
+ if (err)
+ goto out;
+
+ if (S_ISDIR(new->d_inode->i_mode))
+ new_is_dir = true;
+
+ new_type = ovl_path_type(new);
+ err = -EXDEV;
+ if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
+ goto out;
+
+ err = 0;
+ if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
+ if (ovl_dentry_lower(old)->d_inode ==
+ ovl_dentry_lower(new)->d_inode)
+ goto out;
+ }
+ if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
+ if (ovl_dentry_upper(old)->d_inode ==
+ ovl_dentry_upper(new)->d_inode)
+ goto out;
+ }
+ } else {
+ if (ovl_dentry_is_opaque(new))
+ new_type = __OVL_PATH_UPPER;
+ else
+ new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
+ }
+
+ err = ovl_want_write(old);
+ if (err)
+ goto out;
+
+ err = ovl_copy_up(old);
+ if (err)
+ goto out_drop_write;
+
+ err = ovl_copy_up(new->d_parent);
+ if (err)
+ goto out_drop_write;
+ if (!overwrite) {
+ err = ovl_copy_up(new);
+ if (err)
+ goto out_drop_write;
+ }
+
+ old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
+ new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
+
+ if (old_opaque || new_opaque) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_drop_write;
+
+ /*
+ * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
+ * CAP_DAC_OVERRIDE for create in workdir
+ * CAP_FOWNER for removing whiteout from sticky dir
+ * CAP_FSETID for chmod of opaque dir
+ * CAP_CHOWN for chown of opaque dir
+ */
+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
+ cap_raise(override_cred->cap_effective, CAP_FSETID);
+ cap_raise(override_cred->cap_effective, CAP_CHOWN);
+ old_cred = override_creds(override_cred);
+ }
+
+ if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
+ opaquedir = ovl_check_empty_and_clear(new);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir)) {
+ opaquedir = NULL;
+ goto out_revert_creds;
+ }
+ }
+
+ if (overwrite) {
+ if (old_opaque) {
+ if (new->d_inode || !new_opaque) {
+ /* Whiteout source */
+ flags |= RENAME_WHITEOUT;
+ } else {
+ /* Switch whiteouts */
+ flags |= RENAME_EXCHANGE;
+ }
+ } else if (is_dir && !new->d_inode && new_opaque) {
+ flags |= RENAME_EXCHANGE;
+ cleanup_whiteout = true;
+ }
+ }
+
+ old_upperdir = ovl_dentry_upper(old->d_parent);
+ new_upperdir = ovl_dentry_upper(new->d_parent);
+
+ trap = lock_rename(new_upperdir, old_upperdir);
+
+ olddentry = ovl_dentry_upper(old);
+ newdentry = ovl_dentry_upper(new);
+ if (newdentry) {
+ if (opaquedir) {
+ newdentry = opaquedir;
+ opaquedir = NULL;
+ } else {
+ dget(newdentry);
+ }
+ } else {
+ new_create = true;
+ newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_unlock;
+ }
+
+ err = -ESTALE;
+ if (olddentry->d_parent != old_upperdir)
+ goto out_dput;
+ if (newdentry->d_parent != new_upperdir)
+ goto out_dput;
+ if (olddentry == trap)
+ goto out_dput;
+ if (newdentry == trap)
+ goto out_dput;
+
+ if (is_dir && !old_opaque && new_opaque) {
+ err = ovl_set_opaque(olddentry);
+ if (err)
+ goto out_dput;
+ }
+ if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
+ err = ovl_set_opaque(newdentry);
+ if (err)
+ goto out_dput;
+ }
+
+ if (old_opaque || new_opaque) {
+ err = ovl_do_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry,
+ flags);
+ } else {
+ /* No debug for the plain case */
+ BUG_ON(flags & ~RENAME_EXCHANGE);
+ err = vfs_rename(old_upperdir->d_inode, olddentry,
+ new_upperdir->d_inode, newdentry,
+ NULL, flags);
+ }
+
+ if (err) {
+ if (is_dir && !old_opaque && new_opaque)
+ ovl_remove_opaque(olddentry);
+ if (!overwrite && new_is_dir && old_opaque && !new_opaque)
+ ovl_remove_opaque(newdentry);
+ goto out_dput;
+ }
+
+ if (is_dir && old_opaque && !new_opaque)
+ ovl_remove_opaque(olddentry);
+ if (!overwrite && new_is_dir && !old_opaque && new_opaque)
+ ovl_remove_opaque(newdentry);
+
+ if (old_opaque != new_opaque) {
+ ovl_dentry_set_opaque(old, new_opaque);
+ if (!overwrite)
+ ovl_dentry_set_opaque(new, old_opaque);
+ }
+
+ if (cleanup_whiteout)
+ ovl_cleanup(old_upperdir->d_inode, newdentry);
+
+ ovl_dentry_version_inc(old->d_parent);
+ ovl_dentry_version_inc(new->d_parent);
+
+out_dput:
+ dput(newdentry);
+out_unlock:
+ unlock_rename(new_upperdir, old_upperdir);
+out_revert_creds:
+ if (old_opaque || new_opaque) {
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+out_drop_write:
+ ovl_drop_write(old);
+out:
+ dput(opaquedir);
+ return err;
+}
+
+const struct inode_operations ovl_dir_inode_operations = {
+ .lookup = ovl_lookup,
+ .mkdir = ovl_mkdir,
+ .symlink = ovl_symlink,
+ .unlink = ovl_unlink,
+ .rmdir = ovl_rmdir,
+ .rename2 = ovl_rename2,
+ .link = ovl_link,
+ .setattr = ovl_setattr,
+ .create = ovl_create,
+ .mknod = ovl_mknod,
+ .permission = ovl_permission,
+ .getattr = ovl_dir_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 000000000000..04f124884687
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,436 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include "overlayfs.h"
+
+static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
+ bool no_data)
+{
+ int err;
+ struct dentry *parent;
+ struct kstat stat;
+ struct path lowerpath;
+
+ parent = dget_parent(dentry);
+ err = ovl_copy_up(parent);
+ if (err)
+ goto out_dput_parent;
+
+ ovl_path_lower(dentry, &lowerpath);
+ err = vfs_getattr(&lowerpath, &stat);
+ if (err)
+ goto out_dput_parent;
+
+ if (no_data)
+ stat.size = 0;
+
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+
+out_dput_parent:
+ dput(parent);
+ return err;
+}
+
+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ int err;
+ struct dentry *upperdentry;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ upperdentry = ovl_dentry_upper(dentry);
+ if (upperdentry) {
+ mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = notify_change(upperdentry, attr, NULL);
+ mutex_unlock(&upperdentry->d_inode->i_mutex);
+ } else {
+ err = ovl_copy_up_last(dentry, attr, false);
+ }
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct path realpath;
+
+ ovl_path_real(dentry, &realpath);
+ return vfs_getattr(&realpath, stat);
+}
+
+int ovl_permission(struct inode *inode, int mask)
+{
+ struct ovl_entry *oe;
+ struct dentry *alias = NULL;
+ struct inode *realinode;
+ struct dentry *realdentry;
+ bool is_upper;
+ int err;
+
+ if (S_ISDIR(inode->i_mode)) {
+ oe = inode->i_private;
+ } else if (mask & MAY_NOT_BLOCK) {
+ return -ECHILD;
+ } else {
+ /*
+ * For non-directories find an alias and get the info
+ * from there.
+ */
+ alias = d_find_any_alias(inode);
+ if (WARN_ON(!alias))
+ return -ENOENT;
+
+ oe = alias->d_fsdata;
+ }
+
+ realdentry = ovl_entry_real(oe, &is_upper);
+
+ /* Careful in RCU walk mode */
+ realinode = ACCESS_ONCE(realdentry->d_inode);
+ if (!realinode) {
+ WARN_ON(!(mask & MAY_NOT_BLOCK));
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ if (mask & MAY_WRITE) {
+ umode_t mode = realinode->i_mode;
+
+ /*
+ * Writes will always be redirected to upper layer, so
+ * ignore lower layer being read-only.
+ *
+ * If the overlay itself is read-only then proceed
+ * with the permission check, don't return EROFS.
+ * This will only happen if this is the lower layer of
+ * another overlayfs.
+ *
+ * If upper fs becomes read-only after the overlay was
+ * constructed return EROFS to prevent modification of
+ * upper layer.
+ */
+ err = -EROFS;
+ if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ goto out_dput;
+ }
+
+ err = __inode_permission(realinode, mask);
+out_dput:
+ dput(alias);
+ return err;
+}
+
+
+struct ovl_link_data {
+ struct dentry *realdentry;
+ void *cookie;
+};
+
+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ void *ret;
+ struct dentry *realdentry;
+ struct inode *realinode;
+
+ realdentry = ovl_dentry_real(dentry);
+ realinode = realdentry->d_inode;
+
+ if (WARN_ON(!realinode->i_op->follow_link))
+ return ERR_PTR(-EPERM);
+
+ ret = realinode->i_op->follow_link(realdentry, nd);
+ if (IS_ERR(ret))
+ return ret;
+
+ if (realinode->i_op->put_link) {
+ struct ovl_link_data *data;
+
+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
+ if (!data) {
+ realinode->i_op->put_link(realdentry, nd, ret);
+ return ERR_PTR(-ENOMEM);
+ }
+ data->realdentry = realdentry;
+ data->cookie = ret;
+
+ return data;
+ } else {
+ return NULL;
+ }
+}
+
+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+ struct inode *realinode;
+ struct ovl_link_data *data = c;
+
+ if (!data)
+ return;
+
+ realinode = data->realdentry->d_inode;
+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+ kfree(data);
+}
+
+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+ struct path realpath;
+ struct inode *realinode;
+
+ ovl_path_real(dentry, &realpath);
+ realinode = realpath.dentry->d_inode;
+
+ if (!realinode->i_op->readlink)
+ return -EINVAL;
+
+ touch_atime(&realpath);
+
+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+}
+
+
+static bool ovl_is_private_xattr(const char *name)
+{
+ return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
+}
+
+int ovl_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err;
+ struct dentry *upperdentry;
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = -EPERM;
+ if (ovl_is_private_xattr(name))
+ goto out_drop_write;
+
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ upperdentry = ovl_dentry_upper(dentry);
+ err = vfs_setxattr(upperdentry, name, value, size, flags);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static bool ovl_need_xattr_filter(struct dentry *dentry,
+ enum ovl_path_type type)
+{
+ if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
+ return S_ISDIR(dentry->d_inode->i_mode);
+ else
+ return false;
+}
+
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size)
+{
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+ if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ return -ENODATA;
+
+ return vfs_getxattr(realpath.dentry, name, value, size);
+}
+
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ ssize_t res;
+ int off;
+
+ res = vfs_listxattr(realpath.dentry, list, size);
+ if (res <= 0 || size == 0)
+ return res;
+
+ if (!ovl_need_xattr_filter(dentry, type))
+ return res;
+
+ /* filter out private xattrs */
+ for (off = 0; off < res;) {
+ char *s = list + off;
+ size_t slen = strlen(s) + 1;
+
+ BUG_ON(off + slen > res);
+
+ if (ovl_is_private_xattr(s)) {
+ res -= slen;
+ memmove(s, s + slen, res - off);
+ } else {
+ off += slen;
+ }
+ }
+
+ return res;
+}
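To see the effect of the filtering done by ovl_listxattr() above, the upper layer can be inspected directly from userspace. The sketch below is an illustration only (not part of the patch): it lists a file's xattr names with llistxattr(2) and marks those carrying the "trusted.overlay." prefix, which the overlay hides from its own view. Reading trusted.* attributes normally requires CAP_SYS_ADMIN, so run it as root on a file inside the upper directory.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

#define OVL_PREFIX "trusted.overlay."

int main(int argc, char *argv[])
{
	char list[4096];
	ssize_t len, off;

	if (argc < 2)
		return 1;

	len = llistxattr(argv[1], list, sizeof(list));
	if (len < 0) {
		perror("llistxattr");
		return 1;
	}
	/* The list is a sequence of NUL-terminated attribute names. */
	for (off = 0; off < len; off += strlen(list + off) + 1) {
		const char *name = list + off;

		printf("%-40s %s\n", name,
		       strncmp(name, OVL_PREFIX, strlen(OVL_PREFIX)) == 0 ?
		       "(filtered by overlayfs)" : "");
	}
	return 0;
}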
+
+int ovl_removexattr(struct dentry *dentry, const char *name)
+{
+ int err;
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ err = -ENODATA;
+ if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
+ goto out_drop_write;
+
+ if (!OVL_TYPE_UPPER(type)) {
+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+ if (err < 0)
+ goto out_drop_write;
+
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ ovl_path_upper(dentry, &realpath);
+ }
+
+ err = vfs_removexattr(realpath.dentry, name);
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
+ struct dentry *realdentry)
+{
+ if (OVL_TYPE_UPPER(type))
+ return false;
+
+ if (special_file(realdentry->d_inode->i_mode))
+ return false;
+
+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
+ return false;
+
+ return true;
+}
+
+static int ovl_dentry_open(struct dentry *dentry, struct file *file,
+ const struct cred *cred)
+{
+ int err;
+ struct path realpath;
+ enum ovl_path_type type;
+ bool want_write = false;
+
+ type = ovl_path_real(dentry, &realpath);
+ if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+ want_write = true;
+ err = ovl_want_write(dentry);
+ if (err)
+ goto out;
+
+ if (file->f_flags & O_TRUNC)
+ err = ovl_copy_up_last(dentry, NULL, true);
+ else
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ ovl_path_upper(dentry, &realpath);
+ }
+
+ err = vfs_open(&realpath, file, cred);
+out_drop_write:
+ if (want_write)
+ ovl_drop_write(dentry);
+out:
+ return err;
+}
+
+static const struct inode_operations ovl_file_inode_operations = {
+ .setattr = ovl_setattr,
+ .permission = ovl_permission,
+ .getattr = ovl_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+ .dentry_open = ovl_dentry_open,
+};
+
+static const struct inode_operations ovl_symlink_inode_operations = {
+ .setattr = ovl_setattr,
+ .follow_link = ovl_follow_link,
+ .put_link = ovl_put_link,
+ .readlink = ovl_readlink,
+ .getattr = ovl_getattr,
+ .setxattr = ovl_setxattr,
+ .getxattr = ovl_getxattr,
+ .listxattr = ovl_listxattr,
+ .removexattr = ovl_removexattr,
+};
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+ struct ovl_entry *oe)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return NULL;
+
+ mode &= S_IFMT;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+ switch (mode) {
+ case S_IFDIR:
+ inode->i_private = oe;
+ inode->i_op = &ovl_dir_inode_operations;
+ inode->i_fop = &ovl_dir_operations;
+ break;
+
+ case S_IFLNK:
+ inode->i_op = &ovl_symlink_inode_operations;
+ break;
+
+ case S_IFREG:
+ case S_IFSOCK:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFIFO:
+ inode->i_op = &ovl_file_inode_operations;
+ break;
+
+ default:
+ WARN(1, "illegal file type: %i\n", mode);
+ iput(inode);
+ inode = NULL;
+ }
+
+ return inode;
+}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000000..17ac5afc9ffb
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,199 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+
+struct ovl_entry;
+
+enum ovl_path_type {
+ __OVL_PATH_PURE = (1 << 0),
+ __OVL_PATH_UPPER = (1 << 1),
+ __OVL_PATH_MERGE = (1 << 2),
+};
+
+#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
+#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
+#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
+#define OVL_TYPE_MERGE_OR_LOWER(type) \
+ (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
+
+#define OVL_XATTR_PRE_NAME "trusted.overlay."
+#define OVL_XATTR_PRE_LEN 16
+#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
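OVL_XATTR_PRE_LEN is simply the length of the "trusted.overlay." prefix (16 bytes, excluding the terminating NUL). A hypothetical compile-time check, shown only to make that relationship explicit and not present in the original header, could look like this:

#include <linux/bug.h>

static inline void ovl_xattr_prefix_check(void)
{
	/* Fails the build if the prefix literal and its length constant drift apart. */
	BUILD_BUG_ON(sizeof(OVL_XATTR_PRE_NAME) - 1 != OVL_XATTR_PRE_LEN);
}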

+
+static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_rmdir(dir, dentry);
+ pr_debug("rmdir(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_unlink(dir, dentry, NULL);
+ pr_debug("unlink(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry, bool debug)
+{
+ int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+ if (debug) {
+ pr_debug("link(%pd2, %pd2) = %i\n",
+ old_dentry, new_dentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool debug)
+{
+ int err = vfs_create(dir, dentry, mode, true);
+ if (debug)
+ pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool debug)
+{
+ int err = vfs_mkdir(dir, dentry, mode);
+ if (debug)
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+ return err;
+}
+
+static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t dev, bool debug)
+{
+ int err = vfs_mknod(dir, dentry, mode, dev);
+ if (debug) {
+ pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
+ dentry, mode, dev, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
+ const char *oldname, bool debug)
+{
+ int err = vfs_symlink(dir, dentry, oldname);
+ if (debug)
+ pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+ return err;
+}
+
+static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int err = vfs_setxattr(dentry, name, value, size, flags);
+ pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
+ dentry, name, (int) size, (char *) value, flags, err);
+ return err;
+}
+
+static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
+{
+ int err = vfs_removexattr(dentry, name);
+ pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
+ return err;
+}
+
+static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
+ struct inode *newdir, struct dentry *newdentry,
+ unsigned int flags)
+{
+ int err;
+
+ pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
+ olddentry, newdentry, flags);
+
+ err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+
+ if (err) {
+ pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
+ olddentry, newdentry, err);
+ }
+ return err;
+}
+
+static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ int err = vfs_whiteout(dir, dentry);
+ pr_debug("whiteout(%pd2) = %i\n", dentry, err);
+ return err;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry);
+u64 ovl_dentry_version_get(struct dentry *dentry);
+void ovl_dentry_version_inc(struct dentry *dentry);
+void ovl_path_upper(struct dentry *dentry, struct path *path);
+void ovl_path_lower(struct dentry *dentry, struct path *path);
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+struct dentry *ovl_dentry_upper(struct dentry *dentry);
+struct dentry *ovl_dentry_lower(struct dentry *dentry);
+struct dentry *ovl_dentry_real(struct dentry *dentry);
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
+struct dentry *ovl_workdir(struct dentry *dentry);
+int ovl_want_write(struct dentry *dentry);
+void ovl_drop_write(struct dentry *dentry);
+bool ovl_dentry_is_opaque(struct dentry *dentry);
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
+bool ovl_is_whiteout(struct dentry *dentry);
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags);
+struct file *ovl_path_open(struct path *path, int flags);
+
+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
+ struct kstat *stat, const char *link);
+
+/* readdir.c */
+extern const struct file_operations ovl_dir_operations;
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
+void ovl_cache_free(struct list_head *list);
+
+/* inode.c */
+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
+int ovl_permission(struct inode *inode, int mask);
+int ovl_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size);
+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
+int ovl_removexattr(struct dentry *dentry, const char *name);
+
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
+ struct ovl_entry *oe);
+static inline void ovl_copyattr(struct inode *from, struct inode *to)
+{
+ to->i_uid = from->i_uid;
+ to->i_gid = from->i_gid;
+}
+
+/* dir.c */
+extern const struct inode_operations ovl_dir_inode_operations;
+struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
+int ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink, bool debug);
+void ovl_cleanup(struct inode *dir, struct dentry *dentry);
+
+/* copy_up.c */
+int ovl_copy_up(struct dentry *dentry);
+int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
+ struct path *lowerpath, struct kstat *stat,
+ struct iattr *attr);
+int ovl_copy_xattr(struct dentry *old, struct dentry *new);
+int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 000000000000..907870e81a72
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,557 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/xattr.h>
+#include <linux/rbtree.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include "overlayfs.h"
+
+struct ovl_cache_entry {
+ unsigned int len;
+ unsigned int type;
+ u64 ino;
+ struct list_head l_node;
+ struct rb_node node;
+ bool is_whiteout;
+ char name[];
+};
+
+struct ovl_dir_cache {
+ long refcount;
+ u64 version;
+ struct list_head entries;
+};
+
+struct ovl_readdir_data {
+ struct dir_context ctx;
+ bool is_merge;
+ struct rb_root root;
+ struct list_head *list;
+ struct list_head middle;
+ struct dentry *dir;
+ int count;
+ int err;
+};
+
+struct ovl_dir_file {
+ bool is_real;
+ bool is_upper;
+ struct ovl_dir_cache *cache;
+ struct list_head *cursor;
+ struct file *realfile;
+ struct file *upperfile;
+};
+
+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
+{
+ return container_of(n, struct ovl_cache_entry, node);
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
+ const char *name, int len)
+{
+ struct rb_node *node = root->rb_node;
+ int cmp;
+
+ while (node) {
+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
+
+ cmp = strncmp(name, p->name, len);
+ if (cmp > 0)
+ node = p->node.rb_right;
+ else if (cmp < 0 || len < p->len)
+ node = p->node.rb_left;
+ else
+ return p;
+ }
+
+ return NULL;
+}
+
+static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
+ const char *name, int len,
+ u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+ size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
+
+ p = kmalloc(size, GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ memcpy(p->name, name, len);
+ p->name[len] = '\0';
+ p->len = len;
+ p->type = d_type;
+ p->ino = ino;
+ p->is_whiteout = false;
+
+ if (d_type == DT_CHR) {
+ struct dentry *dentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred) {
+ kfree(p);
+ return NULL;
+ }
+
+ /*
+ * CAP_DAC_OVERRIDE for lookup
+ */
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ old_cred = override_creds(override_cred);
+
+ dentry = lookup_one_len(name, dir, len);
+ if (!IS_ERR(dentry)) {
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ }
+ revert_creds(old_cred);
+ put_cred(override_cred);
+ }
+ return p;
+}
+
+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
+ const char *name, int len, u64 ino,
+ unsigned int d_type)
+{
+ struct rb_node **newp = &rdd->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct ovl_cache_entry *p;
+
+ while (*newp) {
+ int cmp;
+ struct ovl_cache_entry *tmp;
+
+ parent = *newp;
+ tmp = ovl_cache_entry_from_node(*newp);
+ cmp = strncmp(name, tmp->name, len);
+ if (cmp > 0)
+ newp = &tmp->node.rb_right;
+ else if (cmp < 0 || len < tmp->len)
+ newp = &tmp->node.rb_left;
+ else
+ return 0;
+ }
+
+ p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
+ if (p == NULL)
+ return -ENOMEM;
+
+ list_add_tail(&p->l_node, rdd->list);
+ rb_link_node(&p->node, parent, newp);
+ rb_insert_color(&p->node, &rdd->root);
+
+ return 0;
+}
+
+static int ovl_fill_lower(struct ovl_readdir_data *rdd,
+ const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+
+ p = ovl_cache_entry_find(&rdd->root, name, namelen);
+ if (p) {
+ list_move_tail(&p->l_node, &rdd->middle);
+ } else {
+ p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
+ if (p == NULL)
+ rdd->err = -ENOMEM;
+ else
+ list_add_tail(&p->l_node, &rdd->middle);
+ }
+
+ return rdd->err;
+}
+
+void ovl_cache_free(struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+ struct ovl_cache_entry *n;
+
+ list_for_each_entry_safe(p, n, list, l_node)
+ kfree(p);
+
+ INIT_LIST_HEAD(list);
+}
+
+static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+{
+ struct ovl_dir_cache *cache = od->cache;
+
+ WARN_ON(cache->refcount <= 0);
+ cache->refcount--;
+ if (!cache->refcount) {
+ if (ovl_dir_cache(dentry) == cache)
+ ovl_set_dir_cache(dentry, NULL);
+
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ }
+}
+
+static int ovl_fill_merge(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ rdd->count++;
+ if (!rdd->is_merge)
+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
+ else
+ return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+}
+
+static inline int ovl_dir_read(struct path *realpath,
+ struct ovl_readdir_data *rdd)
+{
+ struct file *realfile;
+ int err;
+
+ realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+
+ rdd->dir = realpath->dentry;
+ rdd->ctx.pos = 0;
+ do {
+ rdd->count = 0;
+ rdd->err = 0;
+ err = iterate_dir(realfile, &rdd->ctx);
+ if (err >= 0)
+ err = rdd->err;
+ } while (!err && rdd->count);
+ fput(realfile);
+
+ return err;
+}
+
+static void ovl_dir_reset(struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct ovl_dir_cache *cache = od->cache;
+ struct dentry *dentry = file->f_path.dentry;
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (cache && ovl_dentry_version_get(dentry) != cache->version) {
+ ovl_cache_put(od, dentry);
+ od->cache = NULL;
+ od->cursor = NULL;
+ }
+ WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
+ if (od->is_real && OVL_TYPE_MERGE(type))
+ od->is_real = false;
+}
+
+static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
+{
+ int err;
+ struct path realpath;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .list = list,
+ .root = RB_ROOT,
+ .is_merge = false,
+ };
+ int idx, next;
+
+ for (idx = 0; idx != -1; idx = next) {
+ next = ovl_path_next(idx, dentry, &realpath);
+
+ if (next != -1) {
+ err = ovl_dir_read(&realpath, &rdd);
+ if (err)
+ break;
+ } else {
+ /*
+ * Insert lowest layer entries before upper ones; this keeps
+ * directory offsets reasonably stable.
+ */
+ list_add(&rdd.middle, rdd.list);
+ rdd.is_merge = true;
+ err = ovl_dir_read(&realpath, &rdd);
+ list_del(&rdd.middle);
+ }
+ }
+ return err;
+}
+
+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
+{
+ struct list_head *p;
+ loff_t off = 0;
+
+ list_for_each(p, &od->cache->entries) {
+ if (off >= pos)
+ break;
+ off++;
+ }
+ /* Cursor is safe since the cache is stable */
+ od->cursor = p;
+}
+
+static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
+{
+ int res;
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_dir_cache(dentry);
+ if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ cache->refcount++;
+ return cache;
+ }
+ ovl_set_dir_cache(dentry, NULL);
+
+ cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ cache->refcount = 1;
+ INIT_LIST_HEAD(&cache->entries);
+
+ res = ovl_dir_read_merged(dentry, &cache->entries);
+ if (res) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ return ERR_PTR(res);
+ }
+
+ cache->version = ovl_dentry_version_get(dentry);
+ ovl_set_dir_cache(dentry, cache);
+
+ return cache;
+}
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct ovl_cache_entry *p;
+
+ if (!ctx->pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real)
+ return iterate_dir(od->realfile, ctx);
+
+ if (!od->cache) {
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_cache_get(dentry);
+ if (IS_ERR(cache))
+ return PTR_ERR(cache);
+
+ od->cache = cache;
+ ovl_seek_cursor(od, ctx->pos);
+ }
+
+ while (od->cursor != &od->cache->entries) {
+ p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
+ if (!p->is_whiteout)
+ if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
+ break;
+ od->cursor = p->l_node.next;
+ ctx->pos++;
+ }
+ return 0;
+}
+
+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t res;
+ struct ovl_dir_file *od = file->private_data;
+
+ mutex_lock(&file_inode(file)->i_mutex);
+ if (!file->f_pos)
+ ovl_dir_reset(file);
+
+ if (od->is_real) {
+ res = vfs_llseek(od->realfile, offset, origin);
+ file->f_pos = od->realfile->f_pos;
+ } else {
+ res = -EINVAL;
+
+ switch (origin) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ break;
+ case SEEK_SET:
+ break;
+ default:
+ goto out_unlock;
+ }
+ if (offset < 0)
+ goto out_unlock;
+
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ if (od->cache)
+ ovl_seek_cursor(od, offset);
+ }
+ res = offset;
+ }
+out_unlock:
+ mutex_unlock(&file_inode(file)->i_mutex);
+
+ return res;
+}
+
+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct file *realfile = od->realfile;
+
+ /*
+ * Need to check if we started out being a lower dir, but got copied up
+ */
+ if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
+ struct inode *inode = file_inode(file);
+
+ realfile = lockless_dereference(od->upperfile);
+ if (!realfile) {
+ struct path upperpath;
+
+ ovl_path_upper(dentry, &upperpath);
+ realfile = ovl_path_open(&upperpath, O_RDONLY);
+ smp_mb__before_spinlock();
+ mutex_lock(&inode->i_mutex);
+ if (!od->upperfile) {
+ if (IS_ERR(realfile)) {
+ mutex_unlock(&inode->i_mutex);
+ return PTR_ERR(realfile);
+ }
+ od->upperfile = realfile;
+ } else {
+ /* somebody has beaten us to it */
+ if (!IS_ERR(realfile))
+ fput(realfile);
+ realfile = od->upperfile;
+ }
+ mutex_unlock(&inode->i_mutex);
+ }
+ }
+
+ return vfs_fsync_range(realfile, start, end, datasync);
+}
+
+static int ovl_dir_release(struct inode *inode, struct file *file)
+{
+ struct ovl_dir_file *od = file->private_data;
+
+ if (od->cache) {
+ mutex_lock(&inode->i_mutex);
+ ovl_cache_put(od, file->f_path.dentry);
+ mutex_unlock(&inode->i_mutex);
+ }
+ fput(od->realfile);
+ if (od->upperfile)
+ fput(od->upperfile);
+ kfree(od);
+
+ return 0;
+}
+
+static int ovl_dir_open(struct inode *inode, struct file *file)
+{
+ struct path realpath;
+ struct file *realfile;
+ struct ovl_dir_file *od;
+ enum ovl_path_type type;
+
+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
+ if (!od)
+ return -ENOMEM;
+
+ type = ovl_path_real(file->f_path.dentry, &realpath);
+ realfile = ovl_path_open(&realpath, file->f_flags);
+ if (IS_ERR(realfile)) {
+ kfree(od);
+ return PTR_ERR(realfile);
+ }
+ od->realfile = realfile;
+ od->is_real = !OVL_TYPE_MERGE(type);
+ od->is_upper = OVL_TYPE_UPPER(type);
+ file->private_data = od;
+
+ return 0;
+}
+
+const struct file_operations ovl_dir_operations = {
+ .read = generic_read_dir,
+ .open = ovl_dir_open,
+ .iterate = ovl_iterate,
+ .llseek = ovl_dir_llseek,
+ .fsync = ovl_dir_fsync,
+ .release = ovl_dir_release,
+};
+
+int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
+{
+ int err;
+ struct ovl_cache_entry *p;
+
+ err = ovl_dir_read_merged(dentry, list);
+ if (err)
+ return err;
+
+ err = 0;
+
+ list_for_each_entry(p, list, l_node) {
+ if (p->is_whiteout)
+ continue;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ continue;
+ if (p->len == 2 && p->name[1] == '.')
+ continue;
+ }
+ err = -ENOTEMPTY;
+ break;
+ }
+
+ return err;
+}
+
+void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
+{
+ struct ovl_cache_entry *p;
+
+ mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+ list_for_each_entry(p, list, l_node) {
+ struct dentry *dentry;
+
+ if (!p->is_whiteout)
+ continue;
+
+ dentry = lookup_one_len(p->name, upper, p->len);
+ if (IS_ERR(dentry)) {
+ pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
+ upper->d_name.name, p->len, p->name,
+ (int) PTR_ERR(dentry));
+ continue;
+ }
+ ovl_cleanup(upper->d_inode, dentry);
+ dput(dentry);
+ }
+ mutex_unlock(&upper->d_inode->i_mutex);
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 000000000000..5f0d1993e6e3
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,1046 @@
+/*
+ *
+ * Copyright (C) 2011 Novell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include "overlayfs.h"
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Overlay filesystem");
+MODULE_LICENSE("GPL");
+
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
+
+struct ovl_config {
+ char *lowerdir;
+ char *upperdir;
+ char *workdir;
+};
+
+/* private information held for overlayfs's superblock */
+struct ovl_fs {
+ struct vfsmount *upper_mnt;
+ unsigned numlower;
+ struct vfsmount **lower_mnt;
+ struct dentry *workdir;
+ long lower_namelen;
+ /* pathnames of lower and upper dirs, for show_options */
+ struct ovl_config config;
+};
+
+struct ovl_dir_cache;
+
+/* private information held for every overlayfs dentry */
+struct ovl_entry {
+ struct dentry *__upperdentry;
+ struct ovl_dir_cache *cache;
+ union {
+ struct {
+ u64 version;
+ bool opaque;
+ };
+ struct rcu_head rcu;
+ };
+ unsigned numlower;
+ struct path lowerstack[];
+};
+
+#define OVL_MAX_STACK 500
+
+static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe)
+{
+ return oe->numlower ? oe->lowerstack[0].dentry : NULL;
+}
+
+enum ovl_path_type ovl_path_type(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ enum ovl_path_type type = 0;
+
+ if (oe->__upperdentry) {
+ type = __OVL_PATH_UPPER;
+
+ if (oe->numlower) {
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ } else if (!oe->opaque) {
+ type |= __OVL_PATH_PURE;
+ }
+ } else {
+ if (oe->numlower > 1)
+ type |= __OVL_PATH_MERGE;
+ }
+ return type;
+}
+
+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
+{
+ return lockless_dereference(oe->__upperdentry);
+}
+
+void ovl_path_upper(struct dentry *dentry, struct path *path)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ path->mnt = ofs->upper_mnt;
+ path->dentry = ovl_upperdentry_dereference(oe);
+}
+
+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
+{
+ enum ovl_path_type type = ovl_path_type(dentry);
+
+ if (!OVL_TYPE_UPPER(type))
+ ovl_path_lower(dentry, path);
+ else
+ ovl_path_upper(dentry, path);
+
+ return type;
+}
+
+struct dentry *ovl_dentry_upper(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return ovl_upperdentry_dereference(oe);
+}
+
+struct dentry *ovl_dentry_lower(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return __ovl_dentry_lower(oe);
+}
+
+struct dentry *ovl_dentry_real(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (!realdentry)
+ realdentry = __ovl_dentry_lower(oe);
+
+ return realdentry;
+}
+
+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+{
+ struct dentry *realdentry;
+
+ realdentry = ovl_upperdentry_dereference(oe);
+ if (realdentry) {
+ *is_upper = true;
+ } else {
+ realdentry = __ovl_dentry_lower(oe);
+ *is_upper = false;
+ }
+ return realdentry;
+}
+
+struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ return oe->cache;
+}
+
+void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ oe->cache = cache;
+}
+
+void ovl_path_lower(struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL };
+}
+
+int ovl_want_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return mnt_want_write(ofs->upper_mnt);
+}
+
+void ovl_drop_write(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ mnt_drop_write(ofs->upper_mnt);
+}
+
+struct dentry *ovl_workdir(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ return ofs->workdir;
+}
+
+bool ovl_dentry_is_opaque(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ return oe->opaque;
+}
+
+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ oe->opaque = opaque;
+}
+
+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+ WARN_ON(oe->__upperdentry);
+ BUG_ON(!upperdentry->d_inode);
+ /*
+ * Make sure upperdentry is consistent before making it visible to
+ * ovl_upperdentry_dereference().
+ */
+ smp_wmb();
+ oe->__upperdentry = upperdentry;
+}
+
+void ovl_dentry_version_inc(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ oe->version++;
+}
+
+u64 ovl_dentry_version_get(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ return oe->version;
+}
+
+bool ovl_is_whiteout(struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+
+ return inode && IS_WHITEOUT(inode);
+}
+
+static bool ovl_is_opaquedir(struct dentry *dentry)
+{
+ int res;
+ char val;
+ struct inode *inode = dentry->d_inode;
+
+ if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
+ return false;
+
+ res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1);
+ if (res == 1 && val == 'y')
+ return true;
+
+ return false;
+}
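The attribute checked by ovl_is_opaquedir() can also be queried from userspace on the upper directory itself. The sketch below is an illustration only (not part of the patch) and assumes privilege to read trusted.* xattrs; an opaque upper directory carries trusted.overlay.opaque with the single byte 'y'.

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char *argv[])
{
	char val;
	ssize_t res;

	if (argc < 2)
		return 1;

	/* Same check as ovl_is_opaquedir(): one byte, value 'y'. */
	res = lgetxattr(argv[1], "trusted.overlay.opaque", &val, 1);
	printf("%s is %sopaque\n", argv[1],
	       (res == 1 && val == 'y') ? "" : "not ");
	return 0;
}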
+
+static void ovl_dentry_release(struct dentry *dentry)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ if (oe) {
+ unsigned int i;
+
+ dput(oe->__upperdentry);
+ for (i = 0; i < oe->numlower; i++)
+ dput(oe->lowerstack[i].dentry);
+ kfree_rcu(oe, rcu);
+ }
+}
+
+static const struct dentry_operations ovl_dentry_operations = {
+ .d_release = ovl_dentry_release,
+};
+
+static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
+{
+ size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
+ struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+
+ if (oe)
+ oe->numlower = numlower;
+
+ return oe;
+}
+
+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
+ struct qstr *name)
+{
+ struct dentry *dentry;
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ dentry = lookup_one_len(name->name, dir, name->len);
+ mutex_unlock(&dir->d_inode->i_mutex);
+
+ if (IS_ERR(dentry)) {
+ if (PTR_ERR(dentry) == -ENOENT)
+ dentry = NULL;
+ } else if (!dentry->d_inode) {
+ dput(dentry);
+ dentry = NULL;
+ }
+ return dentry;
+}
+
+/*
+ * Returns next layer in stack starting from top.
+ * Returns -1 if this is the last layer.
+ */
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+
+ BUG_ON(idx < 0);
+ if (idx == 0) {
+ ovl_path_upper(dentry, path);
+ if (path->dentry)
+ return oe->numlower ? 1 : -1;
+ idx++;
+ }
+ BUG_ON(idx > oe->numlower);
+ *path = oe->lowerstack[idx - 1];
+
+ return (idx < oe->numlower) ? idx + 1 : -1;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ovl_entry *oe;
+ struct ovl_entry *poe = dentry->d_parent->d_fsdata;
+ struct path *stack = NULL;
+ struct dentry *upperdir, *upperdentry = NULL;
+ unsigned int ctr = 0;
+ struct inode *inode = NULL;
+ bool upperopaque = false;
+ struct dentry *this, *prev = NULL;
+ unsigned int i;
+ int err;
+
+ upperdir = ovl_upperdentry_dereference(poe);
+ if (upperdir) {
+ this = ovl_lookup_real(upperdir, &dentry->d_name);
+ err = PTR_ERR(this);
+ if (IS_ERR(this))
+ goto out;
+
+ if (this) {
+ if (ovl_is_whiteout(this)) {
+ dput(this);
+ this = NULL;
+ upperopaque = true;
+ } else if (poe->numlower && ovl_is_opaquedir(this)) {
+ upperopaque = true;
+ }
+ }
+ upperdentry = prev = this;
+ }
+
+ if (!upperopaque && poe->numlower) {
+ err = -ENOMEM;
+ stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL);
+ if (!stack)
+ goto out_put_upper;
+ }
+
+ for (i = 0; !upperopaque && i < poe->numlower; i++) {
+ bool opaque = false;
+ struct path lowerpath = poe->lowerstack[i];
+
+ this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
+ err = PTR_ERR(this);
+ if (IS_ERR(this)) {
+ /*
+ * If the entry is already positive (found in the upper layer or an
+ * earlier lower layer), treat ENAMETOOLONG in this layer as ENOENT.
+ */
+ if (err == -ENAMETOOLONG && (upperdentry || ctr))
+ continue;
+ goto out_put;
+ }
+ if (!this)
+ continue;
+ if (ovl_is_whiteout(this)) {
+ dput(this);
+ break;
+ }
+ /*
+ * Only makes sense to check opaque dir if this is not the
+ * lowermost layer.
+ */
+ if (i < poe->numlower - 1 && ovl_is_opaquedir(this))
+ opaque = true;
+
+ if (prev && (!S_ISDIR(prev->d_inode->i_mode) ||
+ !S_ISDIR(this->d_inode->i_mode))) {
+ /*
+ * FIXME: checking for upper-opaqueness might be better done
+ * in the remove code.
+ */
+ if (prev == upperdentry)
+ upperopaque = true;
+ dput(this);
+ break;
+ }
+ /*
+ * If this is a non-directory then stop here.
+ */
+ if (!S_ISDIR(this->d_inode->i_mode))
+ opaque = true;
+
+ stack[ctr].dentry = this;
+ stack[ctr].mnt = lowerpath.mnt;
+ ctr++;
+ prev = this;
+ if (opaque)
+ break;
+ }
+
+ oe = ovl_alloc_entry(ctr);
+ err = -ENOMEM;
+ if (!oe)
+ goto out_put;
+
+ if (upperdentry || ctr) {
+ struct dentry *realdentry;
+
+ realdentry = upperdentry ? upperdentry : stack[0].dentry;
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
+ oe);
+ if (!inode)
+ goto out_free_oe;
+ ovl_copyattr(realdentry->d_inode, inode);
+ }
+
+ oe->opaque = upperopaque;
+ oe->__upperdentry = upperdentry;
+ memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr);
+ kfree(stack);
+ dentry->d_fsdata = oe;
+ d_add(dentry, inode);
+
+ return NULL;
+
+out_free_oe:
+ kfree(oe);
+out_put:
+ for (i = 0; i < ctr; i++)
+ dput(stack[i].dentry);
+ kfree(stack);
+out_put_upper:
+ dput(upperdentry);
+out:
+ return ERR_PTR(err);
+}
+
+struct file *ovl_path_open(struct path *path, int flags)
+{
+ return dentry_open(path, flags, current_cred());
+}
+
+static void ovl_put_super(struct super_block *sb)
+{
+ struct ovl_fs *ufs = sb->s_fs_info;
+ unsigned i;
+
+ dput(ufs->workdir);
+ mntput(ufs->upper_mnt);
+ for (i = 0; i < ufs->numlower; i++)
+ mntput(ufs->lower_mnt[i]);
+
+ kfree(ufs->config.lowerdir);
+ kfree(ufs->config.upperdir);
+ kfree(ufs->config.workdir);
+ kfree(ufs);
+}
+
+/**
+ * ovl_statfs
+ * @dentry: The dentry to query
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. As writes always target the upper layer
+ * filesystem, pass the statfs to the upper filesystem (if it exists).
+ */
+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct dentry *root_dentry = dentry->d_sb->s_root;
+ struct path path;
+ int err;
+
+ ovl_path_real(root_dentry, &path);
+
+ err = vfs_statfs(&path, buf);
+ if (!err) {
+ buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
+ buf->f_type = OVERLAYFS_SUPER_MAGIC;
+ }
+
+ return err;
+}
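From userspace, the magic value set by ovl_statfs() above is visible through statfs(2). A minimal sketch, assuming the argument is a path inside an overlay mount; the magic constant is taken from the definition earlier in this file.

#include <stdio.h>
#include <sys/vfs.h>

#define OVERLAYFS_SUPER_MAGIC 0x794c7630

int main(int argc, char *argv[])
{
	struct statfs buf;

	if (argc < 2 || statfs(argv[1], &buf) != 0)
		return 1;

	/* f_namelen reflects the largest lower/upper name limit, see ovl_statfs(). */
	printf("f_type=0x%lx (%soverlayfs), f_namelen=%ld\n",
	       (unsigned long)buf.f_type,
	       buf.f_type == OVERLAYFS_SUPER_MAGIC ? "" : "not ",
	       (long)buf.f_namelen);
	return 0;
}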
+
+/**
+ * ovl_show_options
+ *
+ * Prints the mount options for a given superblock.
+ * Returns zero; does not fail.
+ */
+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+ if (ufs->config.upperdir) {
+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
+ seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ }
+ return 0;
+}
+
+static int ovl_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct ovl_fs *ufs = sb->s_fs_info;
+
+ if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
+ return -EROFS;
+
+ return 0;
+}
+
+static const struct super_operations ovl_super_operations = {
+ .put_super = ovl_put_super,
+ .statfs = ovl_statfs,
+ .show_options = ovl_show_options,
+ .remount_fs = ovl_remount,
+};
+
+enum {
+ OPT_LOWERDIR,
+ OPT_UPPERDIR,
+ OPT_WORKDIR,
+ OPT_ERR,
+};
+
+static const match_table_t ovl_tokens = {
+ {OPT_LOWERDIR, "lowerdir=%s"},
+ {OPT_UPPERDIR, "upperdir=%s"},
+ {OPT_WORKDIR, "workdir=%s"},
+ {OPT_ERR, NULL}
+};
+
+static char *ovl_next_opt(char **s)
+{
+ char *sbegin = *s;
+ char *p;
+
+ if (sbegin == NULL)
+ return NULL;
+
+ for (p = sbegin; *p; p++) {
+ if (*p == '\\') {
+ p++;
+ if (!*p)
+ break;
+ } else if (*p == ',') {
+ *p = '\0';
+ *s = p + 1;
+ return sbegin;
+ }
+ }
+ *s = NULL;
+ return sbegin;
+}
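ovl_next_opt() splits the mount option string on commas while honouring backslash escapes, so a comma inside a directory path can be written as "\,". The userspace sketch below re-implements the same splitting rule purely for illustration (it is not shared code); the sample option string and paths are hypothetical.

#include <stdio.h>

/* Same rule as ovl_next_opt(): ',' separates tokens, '\' escapes the next char. */
static char *next_opt(char **s)
{
	char *sbegin = *s, *p;

	if (!sbegin)
		return NULL;
	for (p = sbegin; *p; p++) {
		if (*p == '\\') {
			p++;
			if (!*p)
				break;
		} else if (*p == ',') {
			*p = '\0';
			*s = p + 1;
			return sbegin;
		}
	}
	*s = NULL;
	return sbegin;
}

int main(void)
{
	char opts[] = "upperdir=/upper,workdir=/work,lowerdir=/mnt/a\\,b";
	char *s = opts, *tok;

	/* Prints three tokens; "lowerdir=/mnt/a\,b" stays a single token. */
	while ((tok = next_opt(&s)) != NULL)
		printf("token: %s\n", tok);
	return 0;
}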
+
+static int ovl_parse_opt(char *opt, struct ovl_config *config)
+{
+ char *p;
+
+ while ((p = ovl_next_opt(&opt)) != NULL) {
+ int token;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, ovl_tokens, args);
+ switch (token) {
+ case OPT_UPPERDIR:
+ kfree(config->upperdir);
+ config->upperdir = match_strdup(&args[0]);
+ if (!config->upperdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_LOWERDIR:
+ kfree(config->lowerdir);
+ config->lowerdir = match_strdup(&args[0]);
+ if (!config->lowerdir)
+ return -ENOMEM;
+ break;
+
+ case OPT_WORKDIR:
+ kfree(config->workdir);
+ config->workdir = match_strdup(&args[0]);
+ if (!config->workdir)
+ return -ENOMEM;
+ break;
+
+ default:
+ pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
+ return -EINVAL;
+ }
+ }
+
+ /* Workdir is useless in non-upper mount */
+ if (!config->upperdir && config->workdir) {
+ pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ config->workdir);
+ kfree(config->workdir);
+ config->workdir = NULL;
+ }
+
+ return 0;
+}
+
+#define OVL_WORKDIR_NAME "work"
+
+static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
+ struct dentry *dentry)
+{
+ struct inode *dir = dentry->d_inode;
+ struct dentry *work;
+ int err;
+ bool retried = false;
+
+ err = mnt_want_write(mnt);
+ if (err)
+ return ERR_PTR(err);
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+retry:
+ work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
+ strlen(OVL_WORKDIR_NAME));
+
+ if (!IS_ERR(work)) {
+ struct kstat stat = {
+ .mode = S_IFDIR | 0,
+ };
+
+ if (work->d_inode) {
+ err = -EEXIST;
+ if (retried)
+ goto out_dput;
+
+ retried = true;
+ ovl_cleanup(dir, work);
+ dput(work);
+ goto retry;
+ }
+
+ err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
+ if (err)
+ goto out_dput;
+ }
+out_unlock:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(mnt);
+
+ return work;
+
+out_dput:
+ dput(work);
+ work = ERR_PTR(err);
+ goto out_unlock;
+}
+
+static void ovl_unescape(char *s)
+{
+ char *d = s;
+
+ for (;; s++, d++) {
+ if (*s == '\\')
+ s++;
+ *d = *s;
+ if (!*s)
+ break;
+ }
+}
+
+static bool ovl_is_allowed_fs_type(struct dentry *root)
+{
+ const struct dentry_operations *dop = root->d_op;
+
+ /*
+ * We don't support:
+ * - automount filesystems
+ * - filesystems with revalidate (FIXME for lower layer)
+ * - filesystems with case insensitive names
+ */
+ if (dop &&
+ (dop->d_manage || dop->d_automount ||
+ dop->d_revalidate || dop->d_weak_revalidate ||
+ dop->d_compare || dop->d_hash)) {
+ return false;
+ }
+ return true;
+}
+
+static int ovl_mount_dir_noesc(const char *name, struct path *path)
+{
+ int err = -EINVAL;
+
+ if (!*name) {
+ pr_err("overlayfs: empty lowerdir\n");
+ goto out;
+ }
+ err = kern_path(name, LOOKUP_FOLLOW, path);
+ if (err) {
+ pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+ goto out;
+ }
+ err = -EINVAL;
+ if (!ovl_is_allowed_fs_type(path->dentry)) {
+ pr_err("overlayfs: filesystem on '%s' not supported\n", name);
+ goto out_put;
+ }
+ if (!S_ISDIR(path->dentry->d_inode->i_mode)) {
+ pr_err("overlayfs: '%s' not a directory\n", name);
+ goto out_put;
+ }
+ return 0;
+
+out_put:
+ path_put(path);
+out:
+ return err;
+}
+
+static int ovl_mount_dir(const char *name, struct path *path)
+{
+ int err = -ENOMEM;
+ char *tmp = kstrdup(name, GFP_KERNEL);
+
+ if (tmp) {
+ ovl_unescape(tmp);
+ err = ovl_mount_dir_noesc(tmp, path);
+ kfree(tmp);
+ }
+ return err;
+}
+
+static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
+ int *stack_depth)
+{
+ int err;
+ struct kstatfs statfs;
+
+ err = ovl_mount_dir_noesc(name, path);
+ if (err)
+ goto out;
+
+ err = vfs_statfs(path, &statfs);
+ if (err) {
+ pr_err("overlayfs: statfs failed on '%s'\n", name);
+ goto out_put;
+ }
+ *namelen = max(*namelen, statfs.f_namelen);
+ *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+
+ return 0;
+
+out_put:
+ path_put(path);
+out:
+ return err;
+}
+
+/* Workdir should not be subdir of upperdir and vice versa */
+static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
+{
+ bool ok = false;
+
+ if (workdir != upperdir) {
+ ok = (lock_rename(workdir, upperdir) == NULL);
+ unlock_rename(workdir, upperdir);
+ }
+ return ok;
+}
+
+static unsigned int ovl_split_lowerdirs(char *str)
+{
+ unsigned int ctr = 1;
+ char *s, *d;
+
+ for (s = d = str;; s++, d++) {
+ if (*s == '\\') {
+ s++;
+ } else if (*s == ':') {
+ *d = '\0';
+ ctr++;
+ continue;
+ }
+ *d = *s;
+ if (!*s)
+ break;
+ }
+ return ctr;
+}
+
+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct path upperpath = { NULL, NULL };
+ struct path workpath = { NULL, NULL };
+ struct dentry *root_dentry;
+ struct ovl_entry *oe;
+ struct ovl_fs *ufs;
+ struct path *stack = NULL;
+ char *lowertmp;
+ char *lower;
+ unsigned int numlower;
+ unsigned int stacklen = 0;
+ unsigned int i;
+ int err;
+
+ err = -ENOMEM;
+ ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
+ if (!ufs)
+ goto out;
+
+ err = ovl_parse_opt((char *) data, &ufs->config);
+ if (err)
+ goto out_free_config;
+
+ err = -EINVAL;
+ if (!ufs->config.lowerdir) {
+ pr_err("overlayfs: missing 'lowerdir'\n");
+ goto out_free_config;
+ }
+
+ sb->s_stack_depth = 0;
+ if (ufs->config.upperdir) {
+ if (!ufs->config.workdir) {
+ pr_err("overlayfs: missing 'workdir'\n");
+ goto out_free_config;
+ }
+
+ err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
+ if (err)
+ goto out_free_config;
+
+ /* Upper fs should not be r/o */
+ if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
+ pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+ err = -EINVAL;
+ goto out_put_upperpath;
+ }
+
+ err = ovl_mount_dir(ufs->config.workdir, &workpath);
+ if (err)
+ goto out_put_upperpath;
+
+ err = -EINVAL;
+ if (upperpath.mnt != workpath.mnt) {
+ pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
+ goto out_put_workpath;
+ }
+ if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
+ pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
+ goto out_put_workpath;
+ }
+ sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth;
+ }
+ err = -ENOMEM;
+ lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL);
+ if (!lowertmp)
+ goto out_put_workpath;
+
+ err = -EINVAL;
+ stacklen = ovl_split_lowerdirs(lowertmp);
+ if (stacklen > OVL_MAX_STACK) {
+ pr_err("overlayfs: too many lower directries, limit is %d\n",
+ OVL_MAX_STACK);
+ goto out_free_lowertmp;
+ } else if (!ufs->config.upperdir && stacklen == 1) {
+ pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+ goto out_free_lowertmp;
+ }
+
+ stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
+ if (!stack)
+ goto out_free_lowertmp;
+
+ lower = lowertmp;
+ for (numlower = 0; numlower < stacklen; numlower++) {
+ err = ovl_lower_dir(lower, &stack[numlower],
+ &ufs->lower_namelen, &sb->s_stack_depth);
+ if (err)
+ goto out_put_lowerpath;
+
+ lower = strchr(lower, '\0') + 1;
+ }
+
+ err = -EINVAL;
+ sb->s_stack_depth++;
+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+ pr_err("overlayfs: maximum fs stacking depth exceeded\n");
+ goto out_put_lowerpath;
+ }
+
+ if (ufs->config.upperdir) {
+ ufs->upper_mnt = clone_private_mount(&upperpath);
+ err = PTR_ERR(ufs->upper_mnt);
+ if (IS_ERR(ufs->upper_mnt)) {
+ pr_err("overlayfs: failed to clone upperpath\n");
+ goto out_put_lowerpath;
+ }
+
+ ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
+ err = PTR_ERR(ufs->workdir);
+ if (IS_ERR(ufs->workdir)) {
+ pr_err("overlayfs: failed to create directory %s/%s\n",
+ ufs->config.workdir, OVL_WORKDIR_NAME);
+ goto out_put_upper_mnt;
+ }
+ }
+
+ err = -ENOMEM;
+ ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL);
+ if (ufs->lower_mnt == NULL)
+ goto out_put_workdir;
+ for (i = 0; i < numlower; i++) {
+ struct vfsmount *mnt = clone_private_mount(&stack[i]);
+
+ err = PTR_ERR(mnt);
+ if (IS_ERR(mnt)) {
+ pr_err("overlayfs: failed to clone lowerpath\n");
+ goto out_put_lower_mnt;
+ }
+ /*
+ * Make lower_mnt read-only. That way fchmod/fchown on a lower
+ * file will fail instead of modifying the lower fs.
+ */
+ mnt->mnt_flags |= MNT_READONLY;
+
+ ufs->lower_mnt[ufs->numlower] = mnt;
+ ufs->numlower++;
+ }
+
+ /* Without an upper fs, mark the overlay mount itself read-only as well */
+ if (!ufs->upper_mnt)
+ sb->s_flags |= MS_RDONLY;
+
+ sb->s_d_op = &ovl_dentry_operations;
+
+ err = -ENOMEM;
+ oe = ovl_alloc_entry(numlower);
+ if (!oe)
+ goto out_put_lower_mnt;
+
+ root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
+ if (!root_dentry)
+ goto out_free_oe;
+
+ mntput(upperpath.mnt);
+ for (i = 0; i < numlower; i++)
+ mntput(stack[i].mnt);
+ path_put(&workpath);
+ kfree(lowertmp);
+
+ oe->__upperdentry = upperpath.dentry;
+ for (i = 0; i < numlower; i++) {
+ oe->lowerstack[i].dentry = stack[i].dentry;
+ oe->lowerstack[i].mnt = ufs->lower_mnt[i];
+ }
+
+ root_dentry->d_fsdata = oe;
+
+ sb->s_magic = OVERLAYFS_SUPER_MAGIC;
+ sb->s_op = &ovl_super_operations;
+ sb->s_root = root_dentry;
+ sb->s_fs_info = ufs;
+
+ return 0;
+
+out_free_oe:
+ kfree(oe);
+out_put_lower_mnt:
+ for (i = 0; i < ufs->numlower; i++)
+ mntput(ufs->lower_mnt[i]);
+ kfree(ufs->lower_mnt);
+out_put_workdir:
+ dput(ufs->workdir);
+out_put_upper_mnt:
+ mntput(ufs->upper_mnt);
+out_put_lowerpath:
+ for (i = 0; i < numlower; i++)
+ path_put(&stack[i]);
+ kfree(stack);
+out_free_lowertmp:
+ kfree(lowertmp);
+out_put_workpath:
+ path_put(&workpath);
+out_put_upperpath:
+ path_put(&upperpath);
+out_free_config:
+ kfree(ufs->config.lowerdir);
+ kfree(ufs->config.upperdir);
+ kfree(ufs->config.workdir);
+ kfree(ufs);
+out:
+ return err;
+}
+
+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data)
+{
+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
+}
+
+static struct file_system_type ovl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "overlay",
+ .mount = ovl_mount,
+ .kill_sb = kill_anon_super,
+};
+MODULE_ALIAS_FS("overlay");
+
+static int __init ovl_init(void)
+{
+ return register_filesystem(&ovl_fs_type);
+}
+
+static void __exit ovl_exit(void)
+{
+ unregister_filesystem(&ovl_fs_type);
+}
+
+module_init(ovl_init);
+module_exit(ovl_exit);
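The lowerdir= handling above hinges on ovl_unescape() and ovl_split_lowerdirs(): colons separate layers unless preceded by a backslash, and the escapes are stripped while splitting, which is why the per-layer lookups then go through ovl_mount_dir_noesc(). A minimal user-space sketch of the same parsing, not part of the patch and shown only for illustration:

#include <stdio.h>
#include <string.h>

/* Same logic as ovl_split_lowerdirs() above: split on unescaped ':' in
 * place, drop the backslash escapes, and return the layer count. */
static unsigned int split_lowerdirs(char *str)
{
	unsigned int ctr = 1;
	char *s, *d;

	for (s = d = str;; s++, d++) {
		if (*s == '\\') {
			s++;		/* copy the escaped character verbatim */
		} else if (*s == ':') {
			*d = '\0';	/* terminate this entry, count the next */
			ctr++;
			continue;
		}
		*d = *s;
		if (!*s)
			break;
	}
	return ctr;
}

int main(void)
{
	char opt[] = "/lower1:/dir\\:with\\:colons:/lower3";
	unsigned int i, n = split_lowerdirs(opt);
	const char *p = opt;

	printf("%u lower layers\n", n);		/* prints 3 */
	for (i = 0; i < n; i++) {
		printf("  [%u] %s\n", i, p);	/* escapes already stripped */
		p = strchr(p, '\0') + 1;
	}
	return 0;
}

On the mount side this corresponds to something like "mount -t overlay overlay -o lowerdir=/l1:/l2,upperdir=/u,workdir=/w /merged"; per ovl_fill_super() above, omitting upperdir/workdir requires at least two lower layers and yields a read-only overlay.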
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ab30716584f5..239493ec718e 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
-proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b9760628e1fd..489ba8caafc0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2555,6 +2555,57 @@ static const struct file_operations proc_projid_map_operations = {
.llseek = seq_lseek,
.release = proc_id_map_release,
};
+
+static int proc_setgroups_open(struct inode *inode, struct file *file)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ int ret;
+
+ ret = -ESRCH;
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ if (file->f_mode & FMODE_WRITE) {
+ ret = -EACCES;
+ if (!ns_capable(ns, CAP_SYS_ADMIN))
+ goto err_put_ns;
+ }
+
+ ret = single_open(file, &proc_setgroups_show, ns);
+ if (ret)
+ goto err_put_ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_setgroups_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ int ret = single_release(inode, file);
+ put_user_ns(ns);
+ return ret;
+}
+
+static const struct file_operations proc_setgroups_operations = {
+ .open = proc_setgroups_open,
+ .write = proc_setgroups_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_setgroups_release,
+};
#endif /* CONFIG_USER_NS */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -2663,6 +2714,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
@@ -2998,6 +3050,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
};
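The setgroups entry is only registered here; proc_setgroups_show() and proc_setgroups_write() live in kernel/user_namespace.c and are outside this diff. As a hedged user-space illustration of how the file is typically used together with the existing uid_map/gid_map entries (assuming the allow/deny interface implemented there, where "deny" must be written before an unprivileged process may write gid_map):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static void write_proc_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf))
		perror(path);
	if (fd >= 0)
		close(fd);
}

/* Maps the caller's uid/gid to root inside the new namespace.  Writing
 * "deny" to setgroups first is what lets an unprivileged process write
 * gid_map at all. */
static void map_ids_as_root(uid_t uid, gid_t gid)
{
	char map[64];

	write_proc_file("/proc/self/setgroups", "deny");
	snprintf(map, sizeof(map), "0 %u 1", (unsigned int)uid);
	write_proc_file("/proc/self/uid_map", map);
	snprintf(map, sizeof(map), "0 %u 1", (unsigned int)gid);
	write_proc_file("/proc/self/gid_map", map);
}

int main(void)
{
	uid_t uid = getuid();
	gid_t gid = getgid();

	if (unshare(CLONE_NEWUSER)) {
		perror("unshare");
		return 1;
	}
	map_ids_as_root(uid, gid);
	execlp("id", "id", (char *)NULL);
	return 1;
}

Run unprivileged, the final id should report uid 0 inside the new user namespace.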
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b7f268eb5f45..2e2d9d5d78d9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -19,7 +19,6 @@
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/idr.h>
-#include <linux/namei.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
@@ -162,17 +161,6 @@ void proc_free_inum(unsigned int inum)
spin_unlock_irqrestore(&proc_inum_lock, flags);
}
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- nd_set_link(nd, __PDE_DATA(dentry->d_inode));
- return NULL;
-}
-
-static const struct inode_operations proc_link_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = proc_follow_link,
-};
-
/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 124fc43c7090..2f2815f3176e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/magic.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
@@ -401,6 +402,26 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
+static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct proc_dir_entry *pde = PDE(dentry->d_inode);
+ if (unlikely(!use_pde(pde)))
+ return ERR_PTR(-EINVAL);
+ nd_set_link(nd, pde->data);
+ return pde;
+}
+
+static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+ unuse_pde(p);
+}
+
+const struct inode_operations proc_link_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = proc_follow_link,
+ .put_link = proc_put_link,
+};
+
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
struct inode *inode = new_inode_pseudo(sb);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a11dde..8b24f3640cd9 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -202,6 +202,7 @@ struct pde_opener {
int closing;
struct completion *c;
};
+extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
@@ -211,13 +212,6 @@ extern int proc_fill_super(struct super_block *);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
- * proc_devtree.c
- */
-#ifdef CONFIG_PROC_DEVICETREE
-extern void proc_device_tree_init(void);
-#endif
-
-/*
* proc_namespaces.c
*/
extern const struct inode_operations proc_ns_dir_inode_operations;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 9ae46b87470d..89026095f2b5 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
struct task_struct *task;
void *ns;
char name[50];
- int len = -EACCES;
+ int res = -EACCES;
task = get_proc_task(inode);
if (!task)
@@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_put_task;
- len = -ENOENT;
+ res = -ENOENT;
ns = ns_ops->get(task);
if (!ns)
goto out_put_task;
snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
- len = strlen(name);
-
- if (len > buflen)
- len = buflen;
- if (copy_to_user(buffer, name, len))
- len = -EFAULT;
-
+ res = readlink_copy(buffer, buflen, name);
ns_ops->put(ns);
out_put_task:
put_task_struct(task);
out:
- return len;
+ return res;
}
static const struct inode_operations proc_ns_link_inode_operations = {
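Several hunks in this diff (here, in fs/proc/self.c and in fs/xfs/xfs_ioctl.c below) switch open-coded "copy the generated link target to the user buffer" sequences over to readlink_copy(). Judging from the code being removed, the helper behaves roughly like the sketch below; the real implementation lives in fs/namei.c, and the usual <linux/uaccess.h>/<linux/err.h>/<linux/string.h> includes are assumed:

/* Sketch only - mirrors the removed do_readlink()/open-coded copies. */
static int readlink_copy_sketch(char __user *buffer, int buflen,
				const char *link)
{
	int len = PTR_ERR(link);

	if (IS_ERR(link))
		return len;

	len = strlen(link);
	if (len > (unsigned int)buflen)
		len = buflen;
	if (copy_to_user(buffer, link, len))
		return -EFAULT;
	return len;
}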
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index c82dd5147845..000000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * proc_devtree.c - handles /proc/device-tree
- *
- * Copyright 1997 Paul Mackerras
- */
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/time.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/printk.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/of.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include "internal.h"
-
-static inline void set_node_proc_entry(struct device_node *np,
- struct proc_dir_entry *de)
-{
- np->pde = de;
-}
-
-static struct proc_dir_entry *proc_device_tree;
-
-/*
- * Supply data on a read from /proc/device-tree/node/property.
- */
-static int property_proc_show(struct seq_file *m, void *v)
-{
- struct property *pp = m->private;
-
- seq_write(m, pp->value, pp->length);
- return 0;
-}
-
-static int property_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, property_proc_show, __PDE_DATA(inode));
-}
-
-static const struct file_operations property_proc_fops = {
- .owner = THIS_MODULE,
- .open = property_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-/*
- * For a node with a name like "gc@10", we make symlinks called "gc"
- * and "@10" to it.
- */
-
-/*
- * Add a property to a node
- */
-static struct proc_dir_entry *
-__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
- const char *name)
-{
- struct proc_dir_entry *ent;
-
- /*
- * Unfortunately proc_register puts each new entry
- * at the beginning of the list. So we rearrange them.
- */
- ent = proc_create_data(name,
- strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
- de, &property_proc_fops, pp);
- if (ent == NULL)
- return NULL;
-
- if (!strncmp(name, "security-", 9))
- proc_set_size(ent, 0); /* don't leak number of password chars */
- else
- proc_set_size(ent, pp->length);
-
- return ent;
-}
-
-
-void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
-{
- __proc_device_tree_add_prop(pde, prop, prop->name);
-}
-
-void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
- struct property *prop)
-{
- remove_proc_entry(prop->name, pde);
-}
-
-void proc_device_tree_update_prop(struct proc_dir_entry *pde,
- struct property *newprop,
- struct property *oldprop)
-{
- struct proc_dir_entry *ent;
-
- if (!oldprop) {
- proc_device_tree_add_prop(pde, newprop);
- return;
- }
-
- for (ent = pde->subdir; ent != NULL; ent = ent->next)
- if (ent->data == oldprop)
- break;
- if (ent == NULL) {
- pr_warn("device-tree: property \"%s\" does not exist\n",
- oldprop->name);
- } else {
- ent->data = newprop;
- ent->size = newprop->length;
- }
-}
-
-/*
- * Various dodgy firmware might give us nodes and/or properties with
- * conflicting names. That's generally ok, except for exporting via /proc,
- * so munge names here to ensure they're unique.
- */
-
-static int duplicate_name(struct proc_dir_entry *de, const char *name)
-{
- struct proc_dir_entry *ent;
- int found = 0;
-
- spin_lock(&proc_subdir_lock);
-
- for (ent = de->subdir; ent != NULL; ent = ent->next) {
- if (strcmp(ent->name, name) == 0) {
- found = 1;
- break;
- }
- }
-
- spin_unlock(&proc_subdir_lock);
-
- return found;
-}
-
-static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
- const char *name)
-{
- char *fixed_name;
- int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
- int i = 1, size;
-
-realloc:
- fixed_name = kmalloc(fixup_len, GFP_KERNEL);
- if (fixed_name == NULL) {
- pr_err("device-tree: Out of memory trying to fixup "
- "name \"%s\"\n", name);
- return name;
- }
-
-retry:
- size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
- size++; /* account for NULL */
-
- if (size > fixup_len) {
- /* We ran out of space, free and reallocate. */
- kfree(fixed_name);
- fixup_len = size;
- goto realloc;
- }
-
- if (duplicate_name(de, fixed_name)) {
- /* Multiple duplicates. Retry with a different offset. */
- i++;
- goto retry;
- }
-
- pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
- np->full_name, fixed_name);
-
- return fixed_name;
-}
-
-/*
- * Process a node, adding entries for its children and its properties.
- */
-void proc_device_tree_add_node(struct device_node *np,
- struct proc_dir_entry *de)
-{
- struct property *pp;
- struct proc_dir_entry *ent;
- struct device_node *child;
- const char *p;
-
- set_node_proc_entry(np, de);
- for (child = NULL; (child = of_get_next_child(np, child));) {
- /* Use everything after the last slash, or the full name */
- p = kbasename(child->full_name);
-
- if (duplicate_name(de, p))
- p = fixup_name(np, de, p);
-
- ent = proc_mkdir(p, de);
- if (ent == NULL)
- break;
- proc_device_tree_add_node(child, ent);
- }
- of_node_put(child);
-
- for (pp = np->properties; pp != NULL; pp = pp->next) {
- p = pp->name;
-
- if (strchr(p, '/'))
- continue;
-
- if (duplicate_name(de, p))
- p = fixup_name(np, de, p);
-
- ent = __proc_device_tree_add_prop(de, pp, p);
- if (ent == NULL)
- break;
- }
-}
-
-/*
- * Called on initialization to set up the /proc/device-tree subtree
- */
-void __init proc_device_tree_init(void)
-{
- struct device_node *root;
-
- proc_device_tree = proc_mkdir("device-tree", NULL);
- if (proc_device_tree == NULL)
- return;
- root = of_find_node_by_path("/");
- if (root == NULL) {
- remove_proc_entry("device-tree", NULL);
- pr_debug("/proc/device-tree: can't find root\n");
- return;
- }
- proc_device_tree_add_node(root, proc_device_tree);
- of_node_put(root);
-}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbef7fe4..7bbeb5257af1 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -183,9 +183,6 @@ void __init proc_root_init(void)
proc_mkdir("openprom", NULL);
#endif
proc_tty_init();
-#ifdef CONFIG_PROC_DEVICETREE
- proc_device_tree_init();
-#endif
proc_mkdir("bus", NULL);
proc_sys_init();
}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ffeb202ec942..4348bb8907c2 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
if (!tgid)
return -ENOENT;
sprintf(tmp, "%d", tgid);
- return vfs_readlink(dentry,buffer,buflen,tmp);
+ return readlink_copy(buffer, buflen, tmp);
}
static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6f599c62f0cc..dbd027235440 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -159,7 +159,7 @@ static int show_stat(struct seq_file *p, void *v)
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_put_decimal_ull(p, ' ', kstat_irqs(j));
+ seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j));
seq_printf(p,
"\nctxt %llu\n"
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8f788193e3d4..c2546717fc2b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
/*
* We remember last_addr rather than next_addr to hit with
- * mmap_cache most of the time. We have zero last_addr at
+ * vmacache most of the time. We have zero last_addr at
* the beginning and also after lseek. We will have -1 last_addr
* after the end of the vmas.
*/
@@ -1226,6 +1227,9 @@ out:
static int pagemap_open(struct inode *inode, struct file *file)
{
+ /* do not disclose physical addresses: attack vector */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
"to stop being page-shift some time soon. See the "
"linux/Documentation/vm/pagemap.txt for details.\n");
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d324..14120a3c6195 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -319,10 +319,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- sprintf(name, "console-%s", psname);
+ sprintf(name, "console-%s-%lld", psname, id);
break;
case PSTORE_TYPE_FTRACE:
- sprintf(name, "ftrace-%s", psname);
+ sprintf(name, "ftrace-%s-%lld", psname, id);
break;
case PSTORE_TYPE_MCE:
sprintf(name, "mce-%s-%lld", psname, id);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fa8cef2cca3a..fe68d8ac4d3d 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -61,6 +61,11 @@ module_param(mem_size, ulong, 0400);
MODULE_PARM_DESC(mem_size,
"size of reserved RAM used to store oops/panic logs");
+static unsigned int mem_type;
+module_param(mem_type, uint, 0600);
+MODULE_PARM_DESC(mem_type,
+ "set to 1 to try to use unbuffered memory (default 0)");
+
static int dump_oops = 1;
module_param(dump_oops, int, 0600);
MODULE_PARM_DESC(dump_oops,
@@ -79,6 +84,7 @@ struct ramoops_context {
struct persistent_ram_zone *fprz;
phys_addr_t phys_addr;
unsigned long size;
+ unsigned int memtype;
size_t record_size;
size_t console_size;
size_t ftrace_size;
@@ -86,6 +92,7 @@ struct ramoops_context {
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
unsigned int dump_write_cnt;
+ /* the *_read_cnt counters below must be reset in ramoops_pstore_open() */
unsigned int dump_read_cnt;
unsigned int console_read_cnt;
unsigned int ftrace_read_cnt;
@@ -101,6 +108,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
cxt->dump_read_cnt = 0;
cxt->console_read_cnt = 0;
+ cxt->ftrace_read_cnt = 0;
return 0;
}
@@ -117,13 +125,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
return NULL;
prz = przs[i];
+ if (!prz)
+ return NULL;
- if (update) {
- /* Update old/shadowed buffer. */
+ /* Update old/shadowed buffer. */
+ if (update)
persistent_ram_save_old(prz);
- if (!persistent_ram_old_size(prz))
- return NULL;
- }
+
+ if (!persistent_ram_old_size(prz))
+ return NULL;
*typep = type;
*id = i;
@@ -353,7 +363,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
size_t sz = cxt->record_size;
cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
- &cxt->ecc_info);
+ &cxt->ecc_info,
+ cxt->memtype);
if (IS_ERR(cxt->przs[i])) {
err = PTR_ERR(cxt->przs[i]);
dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
@@ -383,7 +394,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return -ENOMEM;
}
- *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info);
+ *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
@@ -428,9 +439,9 @@ static int ramoops_probe(struct platform_device *pdev)
if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
- cxt->dump_read_cnt = 0;
cxt->size = pdata->mem_size;
cxt->phys_addr = pdata->mem_address;
+ cxt->memtype = pdata->mem_type;
cxt->record_size = pdata->record_size;
cxt->console_size = pdata->console_size;
cxt->ftrace_size = pdata->ftrace_size;
@@ -561,6 +572,7 @@ static void ramoops_register_dummy(void)
dummy_data->mem_size = mem_size;
dummy_data->mem_address = mem_address;
+ dummy_data->mem_type = 0;
dummy_data->record_size = record_size;
dummy_data->console_size = ramoops_console_size;
dummy_data->ftrace_size = ramoops_ftrace_size;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index de272d426763..bda61a759b68 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -380,7 +380,8 @@ void persistent_ram_zap(struct persistent_ram_zone *prz)
persistent_ram_update_header_ecc(prz);
}
-static void *persistent_ram_vmap(phys_addr_t start, size_t size)
+static void *persistent_ram_vmap(phys_addr_t start, size_t size,
+ unsigned int memtype)
{
struct page **pages;
phys_addr_t page_start;
@@ -392,7 +393,10 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
page_start = start - offset_in_page(start);
page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE);
- prot = pgprot_noncached(PAGE_KERNEL);
+ if (memtype)
+ prot = pgprot_noncached(PAGE_KERNEL);
+ else
+ prot = pgprot_writecombine(PAGE_KERNEL);
pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL);
if (!pages) {
@@ -411,8 +415,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
return vaddr;
}
-static void *persistent_ram_iomap(phys_addr_t start, size_t size)
+static void *persistent_ram_iomap(phys_addr_t start, size_t size,
+ unsigned int memtype)
{
+ void *va;
+
if (!request_mem_region(start, size, "persistent_ram")) {
pr_err("request mem region (0x%llx@0x%llx) failed\n",
(unsigned long long)size, (unsigned long long)start);
@@ -422,19 +429,24 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
buffer_start_add = buffer_start_add_locked;
buffer_size_add = buffer_size_add_locked;
- return ioremap(start, size);
+ if (memtype)
+ va = ioremap(start, size);
+ else
+ va = ioremap_wc(start, size);
+
+ return va;
}
static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
- struct persistent_ram_zone *prz)
+ struct persistent_ram_zone *prz, int memtype)
{
prz->paddr = start;
prz->size = size;
if (pfn_valid(start >> PAGE_SHIFT))
- prz->vaddr = persistent_ram_vmap(start, size);
+ prz->vaddr = persistent_ram_vmap(start, size, memtype);
else
- prz->vaddr = persistent_ram_iomap(start, size);
+ prz->vaddr = persistent_ram_iomap(start, size, memtype);
if (!prz->vaddr) {
pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
@@ -502,7 +514,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
- u32 sig, struct persistent_ram_ecc_info *ecc_info)
+ u32 sig, struct persistent_ram_ecc_info *ecc_info,
+ unsigned int memtype)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
@@ -513,7 +526,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
goto err;
}
- ret = persistent_ram_buffer_map(start, size, prz);
+ ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
goto err;
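The new mem_type knob is consumed by persistent_ram_vmap()/persistent_ram_iomap() above: 0 (the default) maps the region write-combined via pgprot_writecombine()/ioremap_wc(), while 1 restores the old uncached behaviour via pgprot_noncached()/ioremap(). A hedged sketch of a board file requesting the uncached mapping through platform data; the field names besides mem_type follow the dummy_data assignments in ramoops_register_dummy(), and the exact ramoops_platform_data layout plus the address/size values are assumptions:

/* Hypothetical board code; assumes <linux/platform_device.h> and
 * <linux/pstore_ram.h>, address/size values are placeholders. */
static struct ramoops_platform_data board_ramoops_pdata = {
	.mem_address	= 0x8f000000,	/* reserved RAM carve-out */
	.mem_size	= 0x100000,
	.mem_type	= 1,		/* 1 = uncached (ioremap/pgprot_noncached) */
	.record_size	= 0x4000,
	.dump_oops	= 1,
};

static struct platform_device board_ramoops_dev = {
	.name	= "ramoops",
	.dev	= {
		.platform_data = &board_ramoops_pdata,
	},
};

When ramoops is built as a module, the equivalent is passing mem_type=1 alongside the mem_address/mem_size module parameters.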
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ce87c9007b0f..89da95700c69 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -637,7 +637,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
dqstats_inc(DQST_LOOKUPS);
err = sb->dq_op->write_dquot(dquot);
if (!ret && err)
- err = ret;
+ ret = err;
dqput(dquot);
spin_lock(&dq_list_lock);
}
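The dquot_writeback_dquots() one-liner fixes an inverted assignment in the usual "remember the first error but keep going" idiom: with err = ret the freshly returned error was thrown away because ret was still zero. A standalone sketch of the intended shape, with do_one() standing in for ->write_dquot():

#include <stdio.h>

static int do_one(int i)
{
	return (i == 2) ? -5 : 0;	/* pretend item 2 fails with -EIO */
}

int main(void)
{
	int ret = 0, err, i;

	for (i = 0; i < 4; i++) {
		err = do_one(i);
		if (!ret && err)
			ret = err;	/* was "err = ret", which lost the error */
	}
	printf("ret = %d\n", ret);	/* -5, not 0 */
	return 0;
}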
diff --git a/fs/readdir.c b/fs/readdir.c
index 5b53d995cae6..e21af53ec24e 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -72,10 +72,11 @@ struct readdir_callback {
int result;
};
-static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int fillonedir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
- struct readdir_callback *buf = (struct readdir_callback *) __buf;
+ struct readdir_callback *buf =
+ container_of(ctx, struct readdir_callback, ctx);
struct old_linux_dirent __user * dirent;
unsigned long d_ino;
@@ -146,11 +147,12 @@ struct getdents_callback {
int error;
};
-static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int filldir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent __user * dirent;
- struct getdents_callback * buf = (struct getdents_callback *) __buf;
+ struct getdents_callback *buf =
+ container_of(ctx, struct getdents_callback, ctx);
unsigned long d_ino;
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
@@ -230,11 +232,12 @@ struct getdents_callback64 {
int error;
};
-static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
- u64 ino, unsigned int d_type)
+static int filldir64(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
struct linux_dirent64 __user *dirent;
- struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
+ struct getdents_callback64 *buf =
+ container_of(ctx, struct getdents_callback64, ctx);
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
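The conversions in fs/readdir.c above and fs/reiserfs/xattr.c below replace the untyped void *__buf cookie with the struct dir_context that iterate_dir() already carries: the caller embeds the context in its private buffer and the callback recovers the outer structure via container_of(). A minimal kernel-style sketch of the pattern; struct and field names other than ctx are illustrative, <linux/fs.h> and the updated filldir_t prototype introduced by this series are assumed:

struct my_readdir_buf {
	struct dir_context ctx;		/* handed to iterate_dir() */
	int entries;			/* illustrative private state */
};

static int my_filldir(struct dir_context *ctx, const char *name, int namlen,
		      loff_t offset, u64 ino, unsigned int d_type)
{
	struct my_readdir_buf *buf =
		container_of(ctx, struct my_readdir_buf, ctx);

	buf->entries++;			/* type-safe access to private state */
	return 0;
}

static int count_entries(struct file *dir)
{
	struct my_readdir_buf buf = {
		.ctx.actor = my_filldir,
	};
	int err = iterate_dir(dir, &buf.ctx);

	return err ?: buf.entries;
}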
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5cdfbd638b5c..d8b7acfd5066 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -177,10 +177,11 @@ struct reiserfs_dentry_buf {
};
static int
-fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
- u64 ino, unsigned int d_type)
+fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
- struct reiserfs_dentry_buf *dbuf = buf;
+ struct reiserfs_dentry_buf *dbuf =
+ container_of(ctx, struct reiserfs_dentry_buf, ctx);
struct dentry *dentry;
WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
@@ -794,10 +795,12 @@ struct listxattr_buf {
struct dentry *dentry;
};
-static int listxattr_filler(void *buf, const char *name, int namelen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int listxattr_filler(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
{
- struct listxattr_buf *b = (struct listxattr_buf *)buf;
+ struct listxattr_buf *b =
+ container_of(ctx, struct listxattr_buf, ctx);
size_t size;
if (name[0] != '.' ||
(namelen != 1 && (name[1] != '.' || namelen != 2))) {
diff --git a/fs/splice.c b/fs/splice.c
index 12028fa41def..ffb92b9d34ba 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1327,6 +1327,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
return ret;
}
+EXPORT_SYMBOL(do_splice_direct);
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
diff --git a/fs/super.c b/fs/super.c
index 7624267b2043..440ef51cd696 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -81,6 +81,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
total_objects = dentries + inodes + fs_objects + 1;
+ if (!total_objects)
+ total_objects = 1;
/* proportion the scan between the caches */
dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
@@ -112,9 +114,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
- if (!grab_super_passive(sb))
- return 0;
-
+ /*
+ * Don't call grab_super_passive as it is a potential
+ * scalability bottleneck. The counts could get updated
+ * between super_cache_count and super_cache_scan anyway.
+ * Calling super_cache_count with shrinker_rwsem held
+ * ensures that list_lru_count_node() and
+ * s_op->nr_cached_objects() are safe to call here.
+ */
if (sb->s_op && sb->s_op->nr_cached_objects)
total_objects = sb->s_op->nr_cached_objects(sb,
sc->nid);
@@ -125,7 +132,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sc->nid);
total_objects = vfs_pressure_ratio(total_objects);
- drop_super(sb);
return total_objects;
}
@@ -276,10 +282,8 @@ void deactivate_locked_super(struct super_block *s)
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
cleancache_invalidate_fs(s);
- fs->kill_sb(s);
-
- /* caches are now gone, we can safely kill the shrinker now */
unregister_shrinker(&s->s_shrink);
+ fs->kill_sb(s);
put_filesystem(fs);
put_super(s);
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ff8229340cd5..26b69b2d4a45 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -166,15 +166,10 @@ static int do_commit(struct ubifs_info *c)
err = ubifs_orphan_end_commit(c);
if (err)
goto out;
- old_ltail_lnum = c->ltail_lnum;
- err = ubifs_log_end_commit(c, new_ltail_lnum);
- if (err)
- goto out;
err = dbg_check_old_index(c, &zroot);
if (err)
goto out;
- mutex_lock(&c->mst_mutex);
c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
@@ -203,8 +198,9 @@ static int do_commit(struct ubifs_info *c)
c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
else
c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
- err = ubifs_write_master(c);
- mutex_unlock(&c->mst_mutex);
+
+ old_ltail_lnum = c->ltail_lnum;
+ err = ubifs_log_end_commit(c, new_ltail_lnum);
if (err)
goto out;
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index a902c5919e42..8d59de86dc9a 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -106,10 +106,14 @@ static inline long long empty_log_bytes(const struct ubifs_info *c)
h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
t = (long long)c->ltail_lnum * c->leb_size;
- if (h >= t)
+ if (h > t)
return c->log_bytes - h + t;
- else
+ else if (h != t)
return t - h;
+ else if (c->lhead_lnum != c->ltail_lnum)
+ return 0;
+ else
+ return c->log_bytes;
}
/**
@@ -447,9 +451,9 @@ out:
* @ltail_lnum: new log tail LEB number
*
* This function is called on when the commit operation was finished. It
- * moves log tail to new position and unmaps LEBs which contain obsolete data.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * moves log tail to new position and updates the master node so that it stores
+ * the new log tail LEB number. Returns zero in case of success and a negative
+ * error code in case of failure.
*/
int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
{
@@ -477,7 +481,12 @@ int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
spin_unlock(&c->buds_lock);
err = dbg_check_bud_bytes(c);
+ if (err)
+ goto out;
+ err = ubifs_write_master(c);
+
+out:
mutex_unlock(&c->log_mutex);
return err;
}
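The empty_log_bytes() rework above closes an ambiguity in the circular-log accounting: with the old "h >= t" test, a head that had advanced all the way around to the tail's byte position was reported as a completely empty log rather than a completely full one. A standalone sketch with concrete numbers; the LEB size and LEB numbers are made up for illustration:

#include <stdio.h>

/* Mirrors the reworked empty_log_bytes(): the log spans log_lebs LEBs,
 * the head is at (lhead_lnum, lhead_offs), the tail at offset 0 of
 * ltail_lnum.  Equal byte positions mean "empty" only if head and tail
 * are in the same LEB; otherwise the head has wrapped and the log is full. */
static long long empty_log_bytes(int lhead_lnum, int lhead_offs,
				 int ltail_lnum, int leb_size, int log_lebs)
{
	long long h = (long long)lhead_lnum * leb_size + lhead_offs;
	long long t = (long long)ltail_lnum * leb_size;
	long long log_bytes = (long long)log_lebs * leb_size;

	if (h > t)
		return log_bytes - h + t;
	else if (h != t)
		return t - h;
	else if (lhead_lnum != ltail_lnum)
		return 0;		/* head wrapped onto the tail: full */
	else
		return log_bytes;	/* genuinely empty */
}

int main(void)
{
	const int leb = 128 * 1024;

	/* head one and a half LEBs ahead of the tail in a 6-LEB log */
	printf("%lld\n", empty_log_bytes(4, leb / 2, 3, leb, 6));
	/* head LEB full, tail at the start of the next LEB: 0, not "empty" */
	printf("%lld\n", empty_log_bytes(4, leb, 5, leb, 6));
	return 0;
}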
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index ab83ace9910a..1a4bb9e8b3b8 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
* ubifs_write_master - write master node.
* @c: UBIFS file-system description object
*
- * This function writes the master node. The caller has to take the
- * @c->mst_mutex lock before calling this function. Returns zero in case of
- * success and a negative error code in case of failure. The master node is
- * written twice to enable recovery.
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
*/
int ubifs_write_master(struct ubifs_info *c)
{
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5ded8490c0c6..94d9a64287b7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1957,7 +1957,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
mutex_init(&c->lp_mutex);
mutex_init(&c->tnc_mutex);
mutex_init(&c->log_mutex);
- mutex_init(&c->mst_mutex);
mutex_init(&c->umount_mutex);
mutex_init(&c->bu_mutex);
mutex_init(&c->write_reserve_mutex);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e8c8cfe1435c..7ab9c710c749 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1042,7 +1042,6 @@ struct ubifs_debug_info;
*
* @mst_node: master node
* @mst_offs: offset of valid master node
- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
*
* @max_bu_buf_len: maximum bulk-read buffer length
* @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1282,7 +1281,6 @@ struct ubifs_info {
struct ubifs_mst_node *mst_node;
int mst_offs;
- struct mutex mst_mutex;
int max_bu_buf_len;
struct mutex bu_mutex;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 982ce05c87ed..287cd5f23421 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1271,13 +1271,22 @@ update_time:
return 0;
}
+/*
+ * Maximum length of the linked list formed by the ICB hierarchy. The number is
+ * arbitrary - large enough not to limit any real use of rewritten inodes on
+ * write-once media, while still bounding how long we loop on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
static void __udf_read_inode(struct inode *inode)
{
struct buffer_head *bh = NULL;
struct fileEntry *fe;
uint16_t ident;
struct udf_inode_info *iinfo = UDF_I(inode);
+ unsigned int indirections = 0;
+reread:
/*
* Set defaults, but the inode is still incomplete!
* Note: get_new_inode() sets the following on a new inode:
@@ -1314,28 +1323,26 @@ static void __udf_read_inode(struct inode *inode)
ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
&ident);
if (ident == TAG_IDENT_IE && ibh) {
- struct buffer_head *nbh = NULL;
struct kernel_lb_addr loc;
struct indirectEntry *ie;
ie = (struct indirectEntry *)ibh->b_data;
loc = lelb_to_cpu(ie->indirectICB.extLocation);
- if (ie->indirectICB.extLength &&
- (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
- &ident))) {
- if (ident == TAG_IDENT_FE ||
- ident == TAG_IDENT_EFE) {
- memcpy(&iinfo->i_location,
- &loc,
- sizeof(struct kernel_lb_addr));
- brelse(bh);
- brelse(ibh);
- brelse(nbh);
- __udf_read_inode(inode);
+ if (ie->indirectICB.extLength) {
+ brelse(bh);
+ brelse(ibh);
+ memcpy(&iinfo->i_location, &loc,
+ sizeof(struct kernel_lb_addr));
+ if (++indirections > UDF_MAX_ICB_NESTING) {
+ udf_err(inode->i_sb,
+ "too many ICBs in ICB hierarchy"
+ " (max %d supported)\n",
+ UDF_MAX_ICB_NESTING);
+ make_bad_inode(inode);
return;
}
- brelse(nbh);
+ goto reread;
}
}
brelse(ibh);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index d7c6dbe4194b..d89f324bc387 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -80,11 +80,17 @@ static int udf_symlink_filler(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
- int err = -EIO;
+ int err;
unsigned char *p = kmap(page);
struct udf_inode_info *iinfo;
uint32_t pos;
+ /* We don't support symlinks longer than one block */
+ if (inode->i_size > inode->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto out_unmap;
+ }
+
iinfo = UDF_I(inode);
pos = udf_block_map(inode, 0);
@@ -94,8 +100,10 @@ static int udf_symlink_filler(struct file *file, struct page *page)
} else {
bh = sb_bread(inode->i_sb, pos);
- if (!bh)
- goto out;
+ if (!bh) {
+ err = -EIO;
+ goto out_unlock_inode;
+ }
symlink = bh->b_data;
}
@@ -109,9 +117,10 @@ static int udf_symlink_filler(struct file *file, struct page *page)
unlock_page(page);
return 0;
-out:
+out_unlock_inode:
up_read(&iinfo->i_data_sem);
SetPageError(page);
+out_unmap:
kunmap(page);
unlock_page(page);
return err;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5d2518b24cea..0461fbe405b7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -434,10 +434,22 @@ xfs_start_page_writeback(
{
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
- if (clear_dirty)
+
+ /*
+ * if the page was not fully cleaned, we need to ensure that the higher
+ * layers come back to it correctly. That means we need to keep the page
+ * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+ * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+ * write this page in this writeback sweep will be made.
+ */
+ if (clear_dirty) {
clear_page_dirty_for_io(page);
- set_page_writeback(page);
+ set_page_writeback(page);
+ } else
+ set_page_writeback_keepwrite(page);
+
unlock_page(page);
+
/* If no buffers on the page are to be written, finish it here */
if (!buffers)
end_page_writeback(page);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 33149113e333..645f180f5960 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -319,6 +319,10 @@ xfs_buf_item_format(
ASSERT(atomic_read(&bip->bli_refcount) > 0);
ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
(bip->bli_flags & XFS_BLI_STALE));
+ ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+ (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+ && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+
/*
* If it is an inode buffer, transfer the in-memory state to the
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3a137e9f9a7d..5d90b8db0ac7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1946,6 +1946,7 @@ xfs_iunlink(
agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
return 0;
@@ -2037,6 +2038,7 @@ xfs_iunlink_remove(
agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
} else {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 78e62cc471c5..6152cbe353e8 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -271,32 +271,6 @@ xfs_open_by_handle(
return error;
}
-/*
- * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
- * unused first argument.
- */
-STATIC int
-do_readlink(
- char __user *buffer,
- int buflen,
- const char *link)
-{
- int len;
-
- len = PTR_ERR(link);
- if (IS_ERR(link))
- goto out;
-
- len = strlen(link);
- if (len > (unsigned) buflen)
- len = buflen;
- if (copy_to_user(buffer, link, len))
- len = -EFAULT;
- out:
- return len;
-}
-
-
int
xfs_readlink_by_handle(
struct file *parfilp,
@@ -334,7 +308,7 @@ xfs_readlink_by_handle(
error = -xfs_readlink(XFS_I(dentry->d_inode), link);
if (error)
goto out_kfree;
- error = do_readlink(hreq->ohandle, olen, link);
+ error = readlink_copy(hreq->ohandle, olen, link);
if (error)
goto out_kfree;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c6ff3cf5a5bb..0eaaa2d296f0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -321,7 +321,6 @@ reread:
* Initialize the mount structure from the superblock.
*/
xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
- xfs_sb_quota_from_disk(sbp);
/*
* If we haven't validated the superblock, do so now before we try
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6d7d1de13403..1b271f59d518 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1108,6 +1108,11 @@ xfs_qm_reset_dqcounts(
*/
xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
"xfs_quotacheck");
+ /*
+ * Reset type in case we are reusing group quota file for
+ * project quotas or vice versa
+ */
+ ddq->d_flags = type;
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 1e116794bb66..4afd393846d3 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -397,10 +397,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
}
}
-void
-xfs_sb_from_disk(
+static void
+__xfs_sb_from_disk(
struct xfs_sb *to,
- xfs_dsb_t *from)
+ xfs_dsb_t *from,
+ bool convert_xquota)
{
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
@@ -456,6 +457,17 @@ xfs_sb_from_disk(
to->sb_pad = 0;
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /* Convert on-disk quota flags to their in-memory form if requested */
+ if (convert_xquota)
+ xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+ struct xfs_sb *to,
+ xfs_dsb_t *from)
+{
+ __xfs_sb_from_disk(to, from, true);
}
static inline void
@@ -571,7 +583,11 @@ xfs_sb_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_sb sb;
- xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+ /*
+ * Use the variant which doesn't convert the quota flags from their
+ * on-disk format, because xfs_mount_validate_sb checks the on-disk flags.
+ */
+ __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
/*
* Only check the in progress field for the primary superblock as
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c812c5c060de..b626f3db67cb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -474,6 +474,7 @@ xfs_trans_apply_sb_deltas(
whole = 1;
}
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
if (whole)
/*
* Log the whole thing, the fields are noncontiguous.